From d2f05cd30fcd530655649549452c34dea1969281 Mon Sep 17 00:00:00 2001 From: Arisotura Date: Fri, 9 Aug 2019 14:19:13 +0200 Subject: prepare JIT beta branch --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/version.h b/src/version.h index 6250601..9084606 100644 --- a/src/version.h +++ b/src/version.h @@ -19,7 +19,7 @@ #ifndef VERSION_H #define VERSION_H -#define MELONDS_VERSION "0.8.3" +#define MELONDS_VERSION "0.8.3-JIT" #define MELONDS_URL "http://melonds.kuribo64.net/" -- cgit v1.2.3 From c5c342c0091d9bf36500950a21585c5c98dd7d9d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 22 Jun 2019 01:28:32 +0200 Subject: JIT: base all instructions are interpreted --- src/ARM.cpp | 13 +- src/ARMJIT.cpp | 177 ++ src/ARMJIT.h | 140 ++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 332 ++++ src/ARMJIT_x64/ARMJIT_Compiler.h | 54 + src/ARM_InstrInfo.cpp | 376 ++++ src/ARM_InstrInfo.h | 232 +++ src/CMakeLists.txt | 12 + src/CP15.cpp | 7 + src/NDS.cpp | 17 + src/dolphin/Assert.h | 47 + src/dolphin/BitSet.h | 218 +++ src/dolphin/CPUDetect.h | 76 + src/dolphin/CodeBlock.h | 121 ++ src/dolphin/CommonFuncs.cpp | 52 + src/dolphin/CommonFuncs.h | 58 + src/dolphin/Intrinsics.h | 72 + src/dolphin/Log.h | 20 + src/dolphin/MemoryUtil.cpp | 193 ++ src/dolphin/MemoryUtil.h | 22 + src/dolphin/license_dolphin.txt | 339 ++++ src/dolphin/x64ABI.cpp | 119 ++ src/dolphin/x64ABI.h | 57 + src/dolphin/x64CPUDetect.cpp | 274 +++ src/dolphin/x64Emitter.cpp | 3398 ++++++++++++++++++++++++++++++++++++ src/dolphin/x64Emitter.h | 1180 +++++++++++++ src/dolphin/x64Reg.h | 96 + src/libui_sdl/DlgEmuSettings.cpp | 45 +- 28 files changed, 7743 insertions(+), 4 deletions(-) create mode 100644 src/ARMJIT.cpp create mode 100644 src/ARMJIT.h create mode 100644 src/ARMJIT_x64/ARMJIT_Compiler.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Compiler.h create mode 100644 src/ARM_InstrInfo.cpp create mode 100644 src/ARM_InstrInfo.h create mode 100644 src/dolphin/Assert.h create mode 100644 src/dolphin/BitSet.h create mode 100644 src/dolphin/CPUDetect.h create mode 100644 src/dolphin/CodeBlock.h create mode 100644 src/dolphin/CommonFuncs.cpp create mode 100644 src/dolphin/CommonFuncs.h create mode 100644 src/dolphin/Intrinsics.h create mode 100644 src/dolphin/Log.h create mode 100644 src/dolphin/MemoryUtil.cpp create mode 100644 src/dolphin/MemoryUtil.h create mode 100644 src/dolphin/license_dolphin.txt create mode 100644 src/dolphin/x64ABI.cpp create mode 100644 src/dolphin/x64ABI.h create mode 100644 src/dolphin/x64CPUDetect.cpp create mode 100644 src/dolphin/x64Emitter.cpp create mode 100644 src/dolphin/x64Emitter.h create mode 100644 src/dolphin/x64Reg.h (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 6248de2..b709277 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,6 +21,7 @@ #include "ARM.h" #include "ARMInterpreter.h" #include "AREngine.h" +#include "ARMJIT.h" // instruction timing notes @@ -481,7 +482,7 @@ void ARMv5::Execute() while (NDS::ARM9Timestamp < NDS::ARM9Target) { - if (CPSR & 0x20) // THUMB + /*if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -514,7 +515,15 @@ void ARMv5::Execute() } else AddCycles_C(); - } + }*/ + + if (!ARMJIT::IsMapped(Num, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]); + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(Num, R[15] - ((CPSR&0x20)?2:4)); + if (block == NULL) + block = ARMJIT::CompileBlock(this); + Cycles += block(); // TODO optimize this shit!!! 
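+        // Dispatch overview: R[15] runs ahead of the executing instruction
+        // because of pipelining (2 bytes in THUMB mode, 4 in ARM mode), so
+        // R[15] - ((CPSR&0x20)?2:4) is the address of the instruction about
+        // to execute. A NULL entry in the block cache is a miss: CompileBlock()
+        // then translates a short run of instructions (at most 10, stopping at
+        // the first branch) and caches it, and block() returns the cycles the
+        // run consumed.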
if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp new file mode 100644 index 0000000..489cdcf --- /dev/null +++ b/src/ARMJIT.cpp @@ -0,0 +1,177 @@ +#include "ARMJIT.h" + +#include "ARMJIT_x64/ARMJIT_Compiler.h" + +namespace ARMJIT +{ + +Compiler* compiler; +BlockCache cache; + + +#define DUP2(x) x, x + +static ptrdiff_t JIT_MEM[2][32] = { + //arm9 + { + /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), + /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror + /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), + /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), + /* 4X*/ DUP2(-1), + /* 5X*/ DUP2(-1), + /* 6X*/ -1, + offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(-1), + /* 8X*/ DUP2(-1), + /* 9X*/ DUP2(-1), + /* AX*/ DUP2(-1), + /* BX*/ DUP2(-1), + /* CX*/ DUP2(-1), + /* DX*/ DUP2(-1), + /* EX*/ DUP2(-1), + /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) + }, + //arm7 + { + /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), + /* 1X*/ DUP2(-1), + /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), + /* 3X*/ offsetof(BlockCache, SWRAM), + offsetof(BlockCache, ARM7_WRAM), + /* 4X*/ -1, + offsetof(BlockCache, ARM7_WIRAM), + /* 5X*/ DUP2(-1), + /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, + DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(-1), + /* 8X*/ DUP2(-1), + /* 9X*/ DUP2(-1), + /* AX*/ DUP2(-1), + /* BX*/ DUP2(-1), + /* CX*/ DUP2(-1), + /* DX*/ DUP2(-1), + /* EX*/ DUP2(-1), + /* FX*/ DUP2(-1) + } +}; + +static u32 JIT_MASK[2][32] = { + //arm9 + { + /* 0X*/ DUP2(0x00007FFF), + /* 1X*/ DUP2(0x00007FFF), + /* 2X*/ DUP2(0x003FFFFF), + /* 3X*/ DUP2(0x00007FFF), + /* 4X*/ DUP2(0x00000000), + /* 5X*/ DUP2(0x00000000), + /* 6X*/ 0x00000000, + 0x000FFFFF, + /* 7X*/ DUP2(0x00000000), + /* 8X*/ DUP2(0x00000000), + /* 9X*/ DUP2(0x00000000), + /* AX*/ DUP2(0x00000000), + /* BX*/ DUP2(0x00000000), + /* CX*/ DUP2(0x00000000), + /* DX*/ DUP2(0x00000000), + /* EX*/ DUP2(0x00000000), + /* FX*/ DUP2(0x00007FFF) + }, + //arm7 + { + /* 0X*/ DUP2(0x00003FFF), + /* 1X*/ DUP2(0x00000000), + /* 2X*/ DUP2(0x003FFFFF), + /* 3X*/ 0x00007FFF, + 0x0000FFFF, + /* 4X*/ 0x00000000, + 0x0000FFFF, + /* 5X*/ DUP2(0x00000000), + /* 6X*/ DUP2(0x0003FFFF), + /* 7X*/ DUP2(0x00000000), + /* 8X*/ DUP2(0x00000000), + /* 9X*/ DUP2(0x00000000), + /* AX*/ DUP2(0x00000000), + /* BX*/ DUP2(0x00000000), + /* CX*/ DUP2(0x00000000), + /* DX*/ DUP2(0x00000000), + /* EX*/ DUP2(0x00000000), + /* FX*/ DUP2(0x00000000) + } +}; + +#undef DUP2 + + +void Init() +{ + memset(&cache, 0, sizeof(BlockCache)); + + for (int cpu = 0; cpu < 2; cpu++) + for (int i = 0; i < 0x4000; i++) + cache.AddrMapping[cpu][i] = JIT_MEM[cpu][i >> 9] == -1 ? NULL : + (CompiledBlock*)((u8*)&cache + JIT_MEM[cpu][i >> 9]) + + (((i << 14) & JIT_MASK[cpu][i >> 9]) >> 1); + + compiler = new Compiler(); +} + +void DeInit() +{ + delete compiler; +} + +CompiledBlock CompileBlock(ARM* cpu) +{ + bool thumb = cpu->CPSR & 0x20; + + FetchedInstr instrs[12]; + int i = 0; + u32 r15 = cpu->R[15]; + u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + //printf("block %x %d\n", r15, thumb); + do + { + r15 += thumb ? 
2 : 4; + + instrs[i].Instr = nextInstr[0]; + //printf("%x %x\n", instrs[i].Instr, r15); + instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + + if (cpu->Num == 0) + { + ARMv5* cpuv5 = (ARMv5*)cpu; + if (thumb && r15 & 0x2) + { + nextInstr[1] >>= 16; + instrs[i].CodeCycles = 0; + } + else + { + nextInstr[1] = cpuv5->CodeRead32(r15, false); + instrs[i].CodeCycles = cpu->CodeCycles; + } + } + else + { + ARMv4* cpuv4 = (ARMv4*)cpu; + if (thumb) + nextInstr[1] = cpuv4->CodeRead16(r15); + else + nextInstr[1] = cpuv4->CodeRead32(r15); + instrs[i].CodeCycles = cpu->CodeCycles; + } + instrs[i].NextInstr[1] = nextInstr[1]; + instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + + i++; + } while(!instrs[i - 1].Info.Branches() && i < 10); + + CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + + InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block); + + return block; +} + +} \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h new file mode 100644 index 0000000..d718295 --- /dev/null +++ b/src/ARMJIT.h @@ -0,0 +1,140 @@ +#ifndef ARMJIT_H +#define ARMJIT_H + +#include "types.h" + +#include <string.h> + +#include "ARM.h" +#include "ARM_InstrInfo.h" + +namespace ARMJIT +{ + +typedef u32 (*CompiledBlock)(); + +class RegCache +{ + +static const int NativeRegAllocOrder[]; +static const int NativeRegsCount; + +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u32 Instr; + u32 NextInstr[2]; + + u8 CodeCycles; + + ARMInstrInfo::Info Info; +}; + +/* + Copied from DeSmuME + Some names were changed to match the nomenclature of melonDS + + Since it's documented nowhere and I needed some time to understand it, + here's a summary of how it works: + more or less all memory locations from which code can be executed are + represented by an array of function pointers, which point to null or + to a function which executes a block of instructions starting from there. + + The most significant 4 bits of each address are ignored. This 28 bit space is + divided into 0x4000 16 KB blocks, each of which holds a pointer to the relevant + place inside the aforementioned arrays. Only half of the bytes need to be + addressed (ARM addresses are aligned to a 4 byte boundary, Thumb addresses to 2 bytes). + + In case a memory write hits mapped memory, the function block at this + address is set to null, so it's recompiled the next time it's executed. + + This method has disadvantages, namely that only a write to the + first instruction of a block marks it as invalid, and that memory remapping + (SWRAM and VRAM) isn't taken into account.
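+
+    A worked example of the lookup (hypothetical address, mirroring the
+    IsMapped()/LookUpBlock() helpers below): for an ARM9 fetch from 0x02000004,
+
+        CompiledBlock* page = cache.AddrMapping[0][(0x02000004 & 0xFFFFFFF) >> 14];
+        CompiledBlock block = page[(0x02000004 & 0x3FFF) >> 1];
+
+    where Init() in ARMJIT.cpp pointed that page entry into cache.MainRAM,
+    since JIT_MEM maps the 0x2X region there.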
+*/ + +struct BlockCache +{ + CompiledBlock* AddrMapping[2][0x4000] = {0}; + + CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock SWRAM[0x8000/2]; // Shared working RAM + CompiledBlock ARM9_ITCM[0x8000/2]; + CompiledBlock ARM9_LCDC[0xA4000/2]; + CompiledBlock ARM9_BIOS[0x8000/2]; + CompiledBlock ARM7_BIOS[0x4000/2]; + CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM + CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi + CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM +}; + +extern BlockCache cache; + +inline bool IsMapped(u32 num, u32 addr) +{ + return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; +} + +inline CompiledBlock LookUpBlock(u32 num, u32 addr) +{ + return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; +} + +inline void Invalidate16(u32 num, u32 addr) +{ + if (IsMapped(num, addr)) + cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; +} + +inline void Invalidate32(u32 num, u32 addr) +{ + if (IsMapped(num, addr)) + { + CompiledBlock* page = cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; + page[(addr & 0x3FFF) >> 1] = NULL; + page[((addr + 2) & 0x3FFF) >> 1] = NULL; + } +} + +inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) +{ + cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; +} + +inline void ResetBlocks() +{ + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); + memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); + memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); + memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); + memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); + memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); + memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); + memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); + memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); +} + +void Init(); +void DeInit(); + +CompiledBlock CompileBlock(ARM* cpu); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..fb2fda8 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -0,0 +1,332 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" + +#include + +using namespace Gen; + +namespace ARMJIT +{ + +const int RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; +const int RegCache::NativeRegsCount = 5; + +Compiler::Compiler() +{ + AllocCodeSpace(1024 * 1024 * 4); +} + +typedef void (Compiler::*CompileFunc)(); +typedef void (*InterpretFunc)(ARM*); + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + + MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); +} + +void Compiler::SaveCPSR() +{ + if (CPSRDirty) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); + CPSRDirty = false; + } +} + +CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +{ + if (IsAlmostFull()) + { + ResetBlocks(); + ResetCodePtr(); + } + + CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); + + ConstantCycles = 0; + Thumb = cpu->CPSR & 0x20; + Num = cpu->Num; + R15 = cpu->R[15]; + + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + + MOV(64, R(RCPU), ImmPtr(cpu)); + XOR(32, R(RCycles), R(RCycles)); + + LoadCPSR(); + + for (int i = 0; i < instrsCount; i++) + { + R15 += Thumb ? 
2 : 4; + CurrentInstr = instrs[i]; + + CompileFunc comp = NULL; + + if (comp == NULL || i == instrsCount - 1) + { + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + if (i == instrsCount - 1) + { + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + } + + SaveCPSR(); + } + + if (Thumb) + { + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + } + else + { + } + } + else + { + u32 cond = CurrentInstr.Cond(); + if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } + else if (cond == 0xF) + AddCycles_C(); + else + { + FixupBranch skipExecute; + if (cond < 0xE) + { + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + skipExecute = J_CC(CC_Z); + } + else + { + // could have used a LUT, but then where would be the fun? + BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + + skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); + } + + } + + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); + ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + } + else + { + } + + FixupBranch skipFailed; + if (CurrentInstr.Cond() < 0xE) + { + skipFailed = J(); + SetJumpTarget(skipExecute); + + AddCycles_C(); + + SetJumpTarget(skipFailed); + } + } + } + + /* + we don't need to collect the interpreted cycles, + since all functions only add to it, the dispatcher + can take care of it. 
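+
+        (Concretely, with the register roles from ARMJIT_Compiler.h: the
+        epilogue below emits LEA RAX, [RCycles + ConstantCycles], so the
+        dynamically counted cycles and the statically known ones come back
+        as the block's single return value, and the dispatcher in ARM.cpp
+        just does Cycles += block().)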
+ */ + + if (comp == NULL && i != instrsCount - 1) + LoadCPSR(); + } + + SaveCPSR(); + + LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + RET(); + + return res; +} + +void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) +{ + const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = + { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + }; + + const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL + }; +} + +void Compiler::AddCycles_C() +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 
0 : CurrentInstr.CodeCycles); + + if (CurrentInstr.Cond() < 0xE) + ADD(32, R(RCycles), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return R(rm); + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), R(rm)); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), R(rm)); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + { + MOV(32, R(RSCRATCH2), R(rm)); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + else + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + ROR_(32, R(RSCRATCH), Imm8(amount)); + } + else + { + BT(32, R(RCPSR), Imm8(29)); + MOV(32, R(RSCRATCH), R(rm)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } +} + +void Compiler::A_Comp_ALU(const FetchedInstr& instr) +{ +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h new file mode 100644 index 0000000..8e1d100 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -0,0 +1,54 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../dolphin/x64Emitter.h" + +#include "../ARMJIT.h" + + +namespace ARMJIT +{ + +const Gen::X64Reg RCPU = Gen::RBP; +const Gen::X64Reg RCycles = Gen::R14; +const Gen::X64Reg RCPSR = Gen::R15; + +const Gen::X64Reg RSCRATCH = Gen::EAX; +const Gen::X64Reg RSCRATCH2 = Gen::EDX; +const Gen::X64Reg RSCRATCH3 = Gen::ECX; + +class Compiler : public Gen::X64CodeBlock +{ +public: + Compiler(); + + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + + void StartBlock(ARM* cpu); + CompiledBlock FinaliseBlock(); + + void Compile(RegCache& regs, const FetchedInstr& instr); +private: + void AddCycles_C(); + + Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed); + + void A_Comp_ALU(const FetchedInstr& instr); + + void LoadCPSR(); + void SaveCPSR(); + + bool CPSRDirty = false; + + FetchedInstr CurrentInstr; + + bool Thumb; + u32 Num; + u32 R15; + + u32 ConstantCycles; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp new file mode 100644 index 0000000..41c46e1 --- /dev/null +++ b/src/ARM_InstrInfo.cpp @@ -0,0 +1,376 @@ +#include "ARM_InstrInfo.h" + +#include + +namespace ARMInstrInfo +{ + +#define ak(x) ((x) << 13) + +enum { + A_Read0 = 1 << 0, + A_Read16 = 1 << 1, + A_Read8 = 1 << 2, + A_Read12 = 1 << 3, + + A_Write12 = 1 << 4, + A_Write16 = 1 << 5, + A_MemWriteback = 1 << 6, + + A_BranchAlways = 1 << 7, + + // for STRD/LDRD + A_Read12Double = 1 << 8, + A_Write12Double = 1 << 9, + + A_Link = 1 << 10, + + A_LDMSTM = 1 << 11, + + A_ARM9Only = 1 << 12, +}; + +#define A_BIOP A_Read16 +#define A_MONOOP 0 + +#define A_IMPLEMENT_ALU_OP(x,k) \ + const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | 
A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ + \ + const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP) +A_IMPLEMENT_ALU_OP(EOR,BIOP) +A_IMPLEMENT_ALU_OP(SUB,BIOP) +A_IMPLEMENT_ALU_OP(RSB,BIOP) +A_IMPLEMENT_ALU_OP(ADD,BIOP) +A_IMPLEMENT_ALU_OP(ADC,BIOP) +A_IMPLEMENT_ALU_OP(SBC,BIOP) +A_IMPLEMENT_ALU_OP(RSC,BIOP) +A_IMPLEMENT_ALU_OP(ORR,BIOP) +A_IMPLEMENT_ALU_OP(MOV,MONOOP) +A_IMPLEMENT_ALU_OP(BIC,BIOP) +A_IMPLEMENT_ALU_OP(MVN,MONOOP) + +const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; + +#define A_IMPLEMENT_ALU_TEST(x) \ + const u32 A_##x##_IMM = A_Read16 | A_Read0 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST) +A_IMPLEMENT_ALU_TEST(TEQ) +A_IMPLEMENT_ALU_TEST(CMP) +A_IMPLEMENT_ALU_TEST(CMN) + +const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAWy = 
A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); +const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); +const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); +const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); + +const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ); + +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB); + +#define A_LDR A_Write12 +#define A_STR A_Read12 + +#define A_IMPLEMENT_WB_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ + const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSR); \ + const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ + const u32 A_##x##_REG_ROR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ + const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ + const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ + const u32 A_##x##_POST_REG_ROR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + +A_IMPLEMENT_WB_LDRSTR(STR,STR) +A_IMPLEMENT_WB_LDRSTR(STRB,STR) +A_IMPLEMENT_WB_LDRSTR(LDR,LDR) +A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) + +#define A_LDRD A_Write12Double +#define A_STRD A_Read12Double + +#define A_IMPLEMENT_HD_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_REG); \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG); + +A_IMPLEMENT_HD_LDRSTR(STRH,STR) +A_IMPLEMENT_HD_LDRSTR(LDRD,LDRD) +A_IMPLEMENT_HD_LDRSTR(STRD,STRD) +A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) + +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); + +const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM); + +const u32 A_B = A_BranchAlways | ak(ak_B); +const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); +const u32 A_BLX_IMM = A_BranchAlways | A_Link | ak(ak_BLX_IMM); +const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); +const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); + +const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); +const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS); +const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR); +const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC); +const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); + +// THUMB + +#define tk(x) ((x) << 16) + +enum { + T_Read0 
= 1 << 0, + T_Read3 = 1 << 1, + T_Read6 = 1 << 2, + T_Read8 = 1 << 3, + + T_Write0 = 1 << 4, + T_Write8 = 1 << 5, + + T_ReadHi0 = 1 << 6, + T_ReadHi3 = 1 << 7, + T_WriteHi0 = 1 << 8, + + T_ReadR13 = 1 << 9, + T_WriteR13 = 1 << 10, + T_ReadR15 = 1 << 11, + + T_BranchAlways = 1 << 12, + T_ReadR14 = 1 << 13, + T_WriteR14 = 1 << 14, + + T_PopPC = 1 << 15 +}; + +const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_Write8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG); + +const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); +const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); + +const u32 T_ADD_PCREL = T_Write8 | T_ReadR15 | tk(tk_ADD_PCREL); +const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); +const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); + +const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); + +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); +const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); + +const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); + +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | 
tk(tk_STR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); + +const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); + +const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); + +const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); +const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); +const u32 T_BLX_REG = T_BranchAlways | T_ReadR15 | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_B = T_BranchAlways | tk(tk_B); +const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); + +const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | T_ReadR15 | tk(tk_SVC); + +#define INSTRFUNC_PROTO(x) u32 x +#include "ARM_InstrTable.h" +#undef INSTRFUNC_PROTO + +Info Decode(bool thumb, u32 num, u32 instr) +{ + Info res = {0}; + if (thumb) + { + u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + + if (data & T_Read0) + res.SrcRegs |= 1 << (instr & 0x7); + if (data & T_Read3) + res.SrcRegs |= 1 << ((instr >> 3) & 0x7); + if (data & T_Read6) + res.SrcRegs |= 1 << ((instr >> 6) & 0x7); + if (data & T_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_Write0) + res.DstRegs |= 1 << (instr & 0x7); + if (data & T_Write8) + res.DstRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_ReadHi0) + res.SrcRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + if (data & T_ReadHi3) + res.SrcRegs |= 1 << ((instr >> 3) & 0xF); + if (data & T_WriteHi0) + res.DstRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + + if (data & T_ReadR13) + res.SrcRegs |= (1 << 13); + if (data & T_WriteR13) + res.DstRegs |= (1 << 13); + if (data & T_ReadR15) + res.SrcRegs |= (1 << 15); + + if (data & T_BranchAlways) + res.DstRegs |= (1 << 15); + + if (data & T_PopPC && instr & (1 << 8)) + res.DstRegs |= 1 << 15; + + res.Kind = (data >> 16) & 0x3F; + + return res; + } + else + { + u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; + if ((instr & 0xFE000000) == 0xFA000000) + data = A_BLX_IMM; + + if (data & A_ARM9Only && num != 0) + data |= A_BranchAlways | A_Link; + + if (data & A_Read0) + res.SrcRegs |= 1 << (instr & 0xF); + if (data & A_Read16) + res.SrcRegs |= 1 << ((instr >> 16) & 0xF); + if (data & A_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0xF); + if (data & A_Read12) + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + + if (data & A_Write12) + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + if (data & A_Write16) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_MemWriteback && instr & (1 << 21)) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_BranchAlways) + res.DstRegs |= 1 << 15; + + if (data & A_Read12Double) + { + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + res.SrcRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + if (data & A_Write12Double) + { + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + res.DstRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + + if (data & A_Link) + { + res.DstRegs |= 1 << 14; + res.SrcRegs |= 1 << 15; + } + + if (data & A_LDMSTM) + { + res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15); + if (instr & (1 << 21)) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + } + + res.Kind = (data >> 13) & 0x1FF; + + return res; + } +} + +} diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h new file mode 100644 index 0000000..e717664 --- 
/dev/null +++ b/src/ARM_InstrInfo.h @@ -0,0 +1,232 @@ +#ifndef ARMINSTRINFO_H +#define ARMINSTRINFO_H + +#include "types.h" + +namespace ARMInstrInfo +{ + +// Instruction kinds, for faster dispatch + +#define ak_ALU(n) \ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_REG_LSL_IMM_S, \ + ak_##n##_REG_LSR_IMM_S, \ + ak_##n##_REG_ASR_IMM_S, \ + ak_##n##_REG_ROR_IMM_S, \ + \ + ak_##n##_REG_LSL_REG_S, \ + ak_##n##_REG_LSR_REG_S, \ + ak_##n##_REG_ASR_REG_S, \ + ak_##n##_REG_ROR_REG_S, \ + \ + ak_##n##_IMM_S \ + +#define ak_Test(n) \ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM + +#define ak_WB_LDRSTR(n) \ + ak_##n##_REG_LSL, \ + ak_##n##_REG_LSR, \ + ak_##n##_REG_ASR, \ + ak_##n##_REG_ROR, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG_LSL, \ + ak_##n##_POST_REG_LSR, \ + ak_##n##_POST_REG_ASR, \ + ak_##n##_POST_REG_ROR, \ + \ + ak_##n##_POST_IMM + +#define ak_HD_LDRSTR(n) \ + ak_##n##_REG, \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG, \ + ak_##n##_POST_IMM + +enum +{ + ak_ALU(AND), + ak_ALU(EOR), + ak_ALU(SUB), + ak_ALU(RSB), + ak_ALU(ADD), + ak_ALU(ADC), + ak_ALU(SBC), + ak_ALU(RSC), + ak_ALU(ORR), + ak_ALU(MOV), + ak_ALU(BIC), + ak_ALU(MVN), + + ak_ALU(TST), + ak_ALU(TEQ), + ak_ALU(CMP), + ak_ALU(CMN), + + ak_MUL, + ak_MLA, + ak_UMULL, + ak_UMLAL, + ak_SMULL, + ak_SMLAL, + ak_SMLAxy, + ak_SMLAWy, + ak_SMULWy, + ak_SMLALxy, + ak_SMULxy, + + ak_CLZ, + + ak_QADD, + ak_QSUB, + ak_QDADD, + ak_QDSUB, + + ak_WB_LDRSTR(STR), + ak_WB_LDRSTR(STRB), + ak_WB_LDRSTR(LDR), + ak_WB_LDRSTR(LDRB), + + ak_HD_LDRSTR(STRH), + ak_HD_LDRSTR(LDRD), + ak_HD_LDRSTR(STRD), + ak_HD_LDRSTR(LDRH), + ak_HD_LDRSTR(LDRSB), + ak_HD_LDRSTR(LDRSH), + + ak_SWP, + ak_SWPB, + + ak_LDM, + ak_STM, + + ak_B, + ak_BL, + ak_BLX_IMM, + ak_BX, + ak_BLX_REG, + + ak_UNK, + ak_MSR_IMM, + ak_MSR_REG, + ak_MRS, + ak_MCR, + ak_MRC, + ak_SVC, + + ak_Count, + + tk_LSL_IMM = 0, + tk_LSR_IMM, + tk_ASR_IMM, + + tk_ADD_REG_, + tk_SUB_REG_, + tk_ADD_IMM_, + tk_SUB_IMM_, + + tk_MOV_IMM, + tk_CMP_IMM, + tk_ADD_IMM, + tk_SUB_IMM, + + tk_AND_REG, + tk_EOR_REG, + tk_LSL_REG, + tk_LSR_REG, + tk_ASR_REG, + tk_ADC_REG, + tk_SBC_REG, + tk_ROR_REG, + tk_TST_REG, + tk_NEG_REG, + tk_CMP_REG, + tk_CMN_REG, + tk_ORR_REG, + tk_MUL_REG, + tk_BIC_REG, + tk_MVN_REG, + + tk_ADD_HIREG, + tk_CMP_HIREG, + tk_MOV_HIREG, + + tk_ADD_PCREL, + tk_ADD_SPREL, + tk_ADD_SP, + + tk_LDR_PCREL, + tk_STR_REG, + tk_STRB_REG, + tk_LDR_REG, + tk_LDRB_REG, + tk_STRH_REG, + tk_LDRSB_REG, + tk_LDRH_REG, + tk_LDRSH_REG, + tk_STR_IMM, + tk_LDR_IMM, + tk_STRB_IMM, + tk_LDRB_IMM, + tk_STRH_IMM, + tk_LDRH_IMM, + tk_STR_SPREL, + tk_LDR_SPREL, + + tk_PUSH, + tk_POP, + tk_LDMIA, + tk_STMIA, + tk_BCOND, + tk_BX, + tk_BLX_REG, + tk_B, + tk_BL_LONG_1, + tk_BL_LONG_2, + tk_UNK, + tk_SVC, + + tk_Count +}; + +struct Info +{ + u16 DstRegs, SrcRegs; + u16 Kind; + + bool Branches() + { + return DstRegs & (1 << 15); + } +}; + +Info Decode(bool thumb, u32 num, u32 instr); + +} + +#endif \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5537e6d..87200ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,7 @@ project(core) +set (CMAKE_CXX_STANDARD 14) + add_library(core STATIC 
ARCodeList.cpp AREngine.cpp @@ -8,6 +10,7 @@ add_library(core STATIC ARMInterpreter_ALU.cpp ARMInterpreter_Branch.cpp ARMInterpreter_LoadStore.cpp + ARM_InstrInfo.cpp Config.cpp CP15.cpp CRC32.cpp @@ -27,6 +30,15 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp + + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + dolphin/MemoryUtil.cpp ) if (WIN32) diff --git a/src/CP15.cpp b/src/CP15.cpp index c1f46bc..f232bec 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -20,6 +20,7 @@ #include #include "NDS.h" #include "ARM.h" +#include "ARMJIT.h" // access timing for cached regions @@ -811,6 +812,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -832,6 +834,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -853,6 +856,8 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -874,6 +879,8 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/NDS.cpp b/src/NDS.cpp index a2ab6ce..b8fd8cb 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -32,6 +32,7 @@ #include "Wifi.h" #include "AREngine.h" #include "Platform.h" +#include "ARMJIT.h" namespace NDS @@ -161,6 +162,8 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); + ARMJIT::Init(); + DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); DMAs[2] = new DMA(0, 2); @@ -191,6 +194,8 @@ void DeInit() delete ARM9; delete ARM7; + ARMJIT::DeInit(); + for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -1822,6 +1827,8 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { + ARMJIT::Invalidate16(0, addr); + switch (addr & 0xFF000000) { case 0x02000000: @@ -1872,6 +1879,8 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { + ARMJIT::Invalidate16(0, addr); + switch (addr & 0xFF000000) { case 0x02000000: @@ -1938,6 +1947,8 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { + ARMJIT::Invalidate32(0, addr); + switch (addr & 0xFF000000) { case 0x02000000: @@ -2231,6 +2242,8 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { + ARMJIT::Invalidate16(1, addr); + switch (addr & 0xFF800000) { case 0x02000000: @@ -2290,6 +2303,8 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { + ARMJIT::Invalidate16(1, addr); + switch (addr & 0xFF800000) { case 0x02000000: @@ -2359,6 +2374,8 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { + ARMJIT::Invalidate32(1, addr); + switch (addr & 0xFF800000) { case 0x02000000: diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h new file mode 100644 index 0000000..4eb16e0 --- /dev/null +++ b/src/dolphin/Assert.h @@ -0,0 +1,47 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the 
license_dolphin.txt file included. + +#pragma once + +#include + +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ diff --git a/src/dolphin/BitSet.h b/src/dolphin/BitSet.h new file mode 100644 index 0000000..d32b020 --- /dev/null +++ b/src/dolphin/BitSet.h @@ -0,0 +1,218 @@ +// This file is under the public domain. + +#pragma once + +#include +#include +#include +#include "../types.h" + +#ifdef _WIN32 + +#include + +namespace Common +{ +template +constexpr int CountSetBits(T v) +{ + // from https://graphics.stanford.edu/~seander/bithacks.html + // GCC has this built in, but MSVC's intrinsic will only emit the actual + // POPCNT instruction, which we're not depending on + v = v - ((v >> 1) & (T) ~(T)0 / 3); + v = (v & (T) ~(T)0 / 15 * 3) + ((v >> 2) & (T) ~(T)0 / 15 * 3); + v = (v + (v >> 4)) & (T) ~(T)0 / 255 * 15; + return (T)(v * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * 8; +} +inline int LeastSignificantSetBit(u8 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u16 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u32 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u64 val) +{ + unsigned long index; + _BitScanForward64(&index, val); + return (int)index; +} +#else +namespace Common +{ +constexpr int CountSetBits(u8 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u16 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u32 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u64 val) +{ + return __builtin_popcountll(val); +} +inline int LeastSignificantSetBit(u8 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u16 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u32 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u64 val) +{ + return __builtin_ctzll(val); +} +#endif + +// Similar to std::bitset, this is a class which encapsulates a bitset, i.e. +// using the set bits of an integer to represent a set of integers. Like that +// class, it acts like an array of bools: +// BitSet32 bs; +// bs[1] = true; +// but also like the underlying integer ([0] = least significant bit): +// BitSet32 bs2 = ...; +// bs = (bs ^ bs2) & BitSet32(0xffff); +// The following additional functionality is provided: +// - Construction using an initializer list. +// BitSet bs { 1, 2, 4, 8 }; +// - Efficiently iterating through the set bits: +// for (int i : bs) +// [i is the *index* of a set bit] +// (This uses the appropriate CPU instruction to find the next set bit in one +// operation.) 
+// - Counting set bits using .Count() - see comment on that method. + +// TODO: use constexpr when MSVC gets out of the Dark Ages + +template +class BitSet +{ + static_assert(!std::is_signed::value, "BitSet should not be used with signed types"); + +public: + // A reference to a particular bit, returned from operator[]. + class Ref + { + public: + constexpr Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {} + constexpr Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {} + constexpr operator bool() const { return (m_bs->m_val & m_mask) != 0; } + bool operator=(bool set) + { + m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0); + return set; + } + + private: + BitSet* m_bs; + IntTy m_mask; + }; + + // A STL-like iterator is required to be able to use range-based for loops. + class Iterator + { + public: + constexpr Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} + constexpr Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} + Iterator& operator=(Iterator other) + { + new (this) Iterator(other); + return *this; + } + Iterator& operator++() + { + if (m_val == 0) + { + m_bit = -1; + } + else + { + int bit = LeastSignificantSetBit(m_val); + m_val &= ~(1 << bit); + m_bit = bit; + } + return *this; + } + Iterator operator++(int) + { + Iterator other(*this); + ++*this; + return other; + } + constexpr int operator*() const { return m_bit; } + constexpr bool operator==(Iterator other) const { return m_bit == other.m_bit; } + constexpr bool operator!=(Iterator other) const { return m_bit != other.m_bit; } + + private: + IntTy m_val; + int m_bit; + }; + + constexpr BitSet() : m_val(0) {} + constexpr explicit BitSet(IntTy val) : m_val(val) {} + BitSet(std::initializer_list init) + { + m_val = 0; + for (int bit : init) + m_val |= (IntTy)1 << bit; + } + + constexpr static BitSet AllTrue(size_t count) + { + return BitSet(count == sizeof(IntTy) * 8 ? 
~(IntTy)0 : (((IntTy)1 << count) - 1)); + } + + Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); } + constexpr const Ref operator[](size_t bit) const { return (*const_cast(this))[bit]; } + constexpr bool operator==(BitSet other) const { return m_val == other.m_val; } + constexpr bool operator!=(BitSet other) const { return m_val != other.m_val; } + constexpr bool operator<(BitSet other) const { return m_val < other.m_val; } + constexpr bool operator>(BitSet other) const { return m_val > other.m_val; } + constexpr BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); } + constexpr BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); } + constexpr BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); } + constexpr BitSet operator~() const { return BitSet(~m_val); } + constexpr BitSet operator<<(IntTy shift) const { return BitSet(m_val << shift); } + constexpr BitSet operator>>(IntTy shift) const { return BitSet(m_val >> shift); } + constexpr explicit operator bool() const { return m_val != 0; } + BitSet& operator|=(BitSet other) { return *this = *this | other; } + BitSet& operator&=(BitSet other) { return *this = *this & other; } + BitSet& operator^=(BitSet other) { return *this = *this ^ other; } + BitSet& operator<<=(IntTy shift) { return *this = *this << shift; } + BitSet& operator>>=(IntTy shift) { return *this = *this >> shift; } + // Warning: Even though on modern CPUs this is a single fast instruction, + // Dolphin's official builds do not currently assume POPCNT support on x86, + // so slower explicit bit twiddling is generated. Still should generally + // be faster than a loop. + constexpr unsigned int Count() const { return CountSetBits(m_val); } + constexpr Iterator begin() const { return ++Iterator(m_val, 0); } + constexpr Iterator end() const { return Iterator(m_val, -1); } + IntTy m_val; +}; +} // namespace Common + +using BitSet8 = Common::BitSet; +using BitSet16 = Common::BitSet; +using BitSet32 = Common::BitSet; +using BitSet64 = Common::BitSet; diff --git a/src/dolphin/CPUDetect.h b/src/dolphin/CPUDetect.h new file mode 100644 index 0000000..bd4fd8d --- /dev/null +++ b/src/dolphin/CPUDetect.h @@ -0,0 +1,76 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// Detect the CPU, so we'll know which optimizations to use +#pragma once + +#include + +enum class CPUVendor +{ + Intel, + AMD, + ARM, + Other, +}; + +struct CPUInfo +{ + CPUVendor vendor = CPUVendor::Intel; + + char cpu_string[0x41] = {}; + char brand_string[0x21] = {}; + bool OS64bit = false; + bool CPU64bit = false; + bool Mode64bit = false; + + bool HTT = false; + int num_cores = 0; + int logical_cpu_count = 0; + + bool bSSE = false; + bool bSSE2 = false; + bool bSSE3 = false; + bool bSSSE3 = false; + bool bPOPCNT = false; + bool bSSE4_1 = false; + bool bSSE4_2 = false; + bool bLZCNT = false; + bool bSSE4A = false; + bool bAVX = false; + bool bAVX2 = false; + bool bBMI1 = false; + bool bBMI2 = false; + bool bFMA = false; + bool bFMA4 = false; + bool bAES = false; + // FXSAVE/FXRSTOR + bool bFXSR = false; + bool bMOVBE = false; + // This flag indicates that the hardware supports some mode + // in which denormal inputs _and_ outputs are automatically set to (signed) zero. 
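+  // (On x86 this corresponds to the FTZ and DAZ bits in MXCSR; it is
+  // recorded here purely as a capability flag.)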
+ bool bFlushToZero = false; + bool bLAHFSAHF64 = false; + bool bLongMode = false; + bool bAtom = false; + + // ARMv8 specific + bool bFP = false; + bool bASIMD = false; + bool bCRC32 = false; + bool bSHA1 = false; + bool bSHA2 = false; + + // Call Detect() + explicit CPUInfo(); + + // Turn the CPU info into a string we can show + std::string Summarize(); + +private: + // Detects the various CPU features + void Detect(); +}; + +extern CPUInfo cpu_info; diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h new file mode 100644 index 0000000..1434297 --- /dev/null +++ b/src/dolphin/CodeBlock.h @@ -0,0 +1,121 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#pragma once + +#include +#include + +#include "Assert.h" +#include "../types.h" +#include "MemoryUtil.h" + +namespace Common +{ +// Everything that needs to generate code should inherit from this. +// You get memory management for free, plus, you can use all emitter functions without +// having to prefix them with gen-> or something similar. +// Example implementation: +// class JIT : public CodeBlock {} +template +class CodeBlock : public T +{ +private: + // A privately used function to set the executable RAM space to something invalid. + // For debugging usefulness it should be used to set the RAM to a host specific breakpoint + // instruction + virtual void PoisonMemory() = 0; + +protected: + u8* region = nullptr; + // Size of region we can use. + size_t region_size = 0; + // Original size of the region we allocated. + size_t total_region_size = 0; + + bool m_is_child = false; + std::vector m_children; + +public: + CodeBlock() = default; + virtual ~CodeBlock() + { + if (region) + FreeCodeSpace(); + } + CodeBlock(const CodeBlock&) = delete; + CodeBlock& operator=(const CodeBlock&) = delete; + CodeBlock(CodeBlock&&) = delete; + CodeBlock& operator=(CodeBlock&&) = delete; + + // Call this before you generate any code. + void AllocCodeSpace(size_t size) + { + region_size = size; + total_region_size = size; + region = static_cast(Common::AllocateExecutableMemory(total_region_size)); + T::SetCodePtr(region); + } + + // Always clear code space with breakpoints, so that if someone accidentally executes + // uninitialized, it just breaks into the debugger. + void ClearCodeSpace() + { + PoisonMemory(); + ResetCodePtr(); + } + + // Call this when shutting down. Don't rely on the destructor, even though it'll do the job. + void FreeCodeSpace() + { + ASSERT(!m_is_child); + Common::FreeMemoryPages(region, total_region_size); + region = nullptr; + region_size = 0; + total_region_size = 0; + for (CodeBlock* child : m_children) + { + child->region = nullptr; + child->region_size = 0; + child->total_region_size = 0; + } + } + + bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } + // Cannot currently be undone. Will write protect the entire code region. + // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()). + void WriteProtect() { Common::WriteProtectMemory(region, region_size, true); } + void ResetCodePtr() { T::SetCodePtr(region); } + size_t GetSpaceLeft() const + { + ASSERT(static_cast(T::GetCodePtr() - region) < region_size); + return region_size - (T::GetCodePtr() - region); + } + + bool IsAlmostFull() const + { + // This should be bigger than the biggest block ever. 
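+    // 64 KiB of headroom. In this tree, Compiler::CompileBlock() reacts to
+    // IsAlmostFull() by calling ResetBlocks() and ResetCodePtr(), throwing
+    // the whole translation cache away and starting over.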
+ return GetSpaceLeft() < 0x10000; + } + + bool HasChildren() const { return region_size != total_region_size; } + u8* AllocChildCodeSpace(size_t child_size) + { + ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); + u8* child_region = region + region_size - child_size; + region_size -= child_size; + return child_region; + } + void AddChildCodeSpace(CodeBlock* child, size_t child_size) + { + u8* child_region = AllocChildCodeSpace(child_size); + child->m_is_child = true; + child->region = child_region; + child->region_size = child_size; + child->total_region_size = child_size; + child->ResetCodePtr(); + m_children.emplace_back(child); + } +}; +} // namespace Common diff --git a/src/dolphin/CommonFuncs.cpp b/src/dolphin/CommonFuncs.cpp new file mode 100644 index 0000000..f85051d --- /dev/null +++ b/src/dolphin/CommonFuncs.cpp @@ -0,0 +1,52 @@ +// Copyright 2009 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include +#include +#include +#include + +#include "CommonFuncs.h" + +#ifdef _WIN32 +#include +#define strerror_r(err, buf, len) strerror_s(buf, len, err) +#endif + +constexpr size_t BUFFER_SIZE = 256; + +// Wrapper function to get last strerror(errno) string. +// This function might change the error code. +std::string LastStrerrorString() +{ + char error_message[BUFFER_SIZE]; + + // There are two variants of strerror_r. The XSI version stores the message to the passed-in + // buffer and returns an int (0 on success). The GNU version returns a pointer to the message, + // which might have been stored in the passed-in buffer or might be a static string. + + // We check defines in order to figure out variant is in use, and we store the returned value + // to a variable so that we'll get a compile-time check that our assumption was correct. + +#if defined(__GLIBC__) && (_GNU_SOURCE || (_POSIX_C_SOURCE < 200112L && _XOPEN_SOURCE < 600)) + const char* str = strerror_r(errno, error_message, BUFFER_SIZE); + return std::string(str); +#else + int error_code = strerror_r(errno, error_message, BUFFER_SIZE); + return error_code == 0 ? std::string(error_message) : ""; +#endif +} + +#ifdef _WIN32 +// Wrapper function to get GetLastError() string. +// This function might change the error code. +std::string GetLastErrorString() +{ + char error_message[BUFFER_SIZE]; + + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, GetLastError(), + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), error_message, BUFFER_SIZE, nullptr); + return std::string(error_message); +} +#endif diff --git a/src/dolphin/CommonFuncs.h b/src/dolphin/CommonFuncs.h new file mode 100644 index 0000000..708fbc3 --- /dev/null +++ b/src/dolphin/CommonFuncs.h @@ -0,0 +1,58 @@ +// Copyright 2009 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. 
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include "../types.h"
+
+// Will fail to compile on a non-array:
+template <typename T, size_t N>
+constexpr size_t ArraySize(T (&arr)[N])
+{
+ return N;
+}
+
+#ifndef _WIN32
+
+// go to debugger mode
+#define Crash() \
+ { \
+ __builtin_trap(); \
+ }
+
+#else // WIN32
+// Function Cross-Compatibility
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define unlink _unlink
+#define vscprintf _vscprintf
+
+// 64 bit offsets for Windows
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#define atoll _atoi64
+#define stat _stat64
+#define fstat _fstat64
+#define fileno _fileno
+
+extern "C" {
+__declspec(dllimport) void __stdcall DebugBreak(void);
+}
+#define Crash() \
+ { \
+ DebugBreak(); \
+ }
+#endif // WIN32 ndef
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString();
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code.
+std::string GetLastErrorString();
+#endif
diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h
new file mode 100644
index 0000000..483f219
--- /dev/null
+++ b/src/dolphin/Intrinsics.h
@@ -0,0 +1,72 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#if defined(_M_X86)
+
+/**
+ * It is assumed that all compilers used to build Dolphin support intrinsics up to and including
+ * SSE 4.2 on x86/x64.
+ */
+
+#if defined(__GNUC__) || defined(__clang__)
+
+/**
+ * Due to limitations in GCC, SSE intrinsics are only available when compiling with the
+ * corresponding instruction set enabled. However, using the target attribute, we can compile
+ * single functions with a different target instruction set, while still creating a generic build.
+ *
+ * Since this instruction set is enabled per-function, any callers should verify that the
+ * instruction set is supported at runtime before calling it, and provide a fallback implementation
+ * when not supported.
+ *
+ * When building with -march=native, or enabling the instruction sets in the compile flags, permit
+ * usage of the intrinsics without any function attributes. If the command-line architecture does
+ * not support this instruction set, enable it via function targeting.
+ */
+
+#include <x86intrin.h>
+#ifndef __SSE4_2__
+#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]]
+#endif
+#ifndef __SSE4_1__
+#define FUNCTION_TARGET_SSE41 [[gnu::target("sse4.1")]]
+#endif
+#ifndef __SSSE3__
+#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]]
+#endif
+#ifndef __SSE3__
+#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]]
+#endif
+
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+
+/**
+ * MSVC and ICC support intrinsics for any instruction set without any function attributes.
+ */
+#include <intrin.h>
+
+#endif // defined(_MSC_VER) || defined(__INTEL_COMPILER)
+
+#endif // _M_X86
+
+/**
+ * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform.
+ * This way when a function is defined with FUNCTION_TARGET you don't need to define a second
+ * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use
+ * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures.
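+ *
+ * For example (a sketch; crc32_u32() and the scalar fallback are illustrative
+ * names, not part of this header):
+ *
+ *   FUNCTION_TARGET_SSE42
+ *   static u32 crc32_u32(u32 crc, u32 v) { return _mm_crc32_u32(crc, v); }
+ *
+ *   // call site: check cpu_info.bSSE4_2 at runtime, else use the fallback
+ *   u32 r = cpu_info.bSSE4_2 ? crc32_u32(crc, v) : crc32_u32_generic(crc, v);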
+ */
+#ifndef FUNCTION_TARGET_SSE42
+#define FUNCTION_TARGET_SSE42
+#endif
+#ifndef FUNCTION_TARGET_SSE41
+#define FUNCTION_TARGET_SSE41
+#endif
+#ifndef FUNCTION_TARGET_SSSE3
+#define FUNCTION_TARGET_SSSE3
+#endif
+#ifndef FUNCTION_TARGET_SSE3
+#define FUNCTION_TARGET_SSE3
+#endif
diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h
new file mode 100644
index 0000000..21e69a5
--- /dev/null
+++ b/src/dolphin/Log.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "CommonFuncs.h"
+
+#include <stdio.h>
+
+#define PanicAlert(msg) \
+ do \
+ { \
+ printf("%s\n", msg); \
+ Crash(); \
+ } while (false)
+
+#define DYNA_REC 0
+
+#define ERROR_LOG(which, fmt, ...) \
+ do \
+ { \
+ printf(fmt "\n", ## __VA_ARGS__); \
+ } while (false)
diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp
new file mode 100644
index 0000000..01cb897
--- /dev/null
+++ b/src/dolphin/MemoryUtil.cpp
@@ -0,0 +1,193 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+
+#define PanicAlert(fmt, ...) \
+ do \
+ { \
+ printf(fmt "\n", ## __VA_ARGS__); \
+ abort(); \
+ } while (false)
+
+#include "../types.h"
+#include "CommonFuncs.h"
+
+#ifdef _WIN32
+#include <windows.h>
+//#include "Common/StringUtil.h"
+#else
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#if defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
+#include <sys/sysctl.h>
+#elif defined __HAIKU__
+#include <OS.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+namespace Common
+{
+// This is purposely not a full wrapper for virtualalloc/mmap, but it
+// provides exactly the primitive operations that Dolphin needs.
+
+void* AllocateExecutableMemory(size_t size)
+{
+ printf("c\n");
+
+#if defined(_WIN32)
+ void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+#else
+ void* ptr =
+ mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
+
+ if (ptr == MAP_FAILED)
+ ptr = nullptr;
+#endif
+ printf("a\n");
+
+ if (ptr == nullptr)
+ PanicAlert("Failed to allocate executable memory");
+
+ printf("b\n");
+
+ return ptr;
+}
+
+void* AllocateMemoryPages(size_t size)
+{
+#ifdef _WIN32
+ void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_READWRITE);
+#else
+ void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+
+ if (ptr == MAP_FAILED)
+ ptr = nullptr;
+#endif
+
+ if (ptr == nullptr)
+ PanicAlert("Failed to allocate raw memory");
+
+ return ptr;
+}
+
+void* AllocateAlignedMemory(size_t size, size_t alignment)
+{
+#ifdef _WIN32
+ void* ptr = _aligned_malloc(size, alignment);
+#else
+ void* ptr = nullptr;
+ if (posix_memalign(&ptr, alignment, size) != 0)
+ ERROR_LOG(MEMMAP, "Failed to allocate aligned memory");
+#endif
+
+ if (ptr == nullptr)
+ PanicAlert("Failed to allocate aligned memory");
+
+ return ptr;
+}
+
+void FreeMemoryPages(void* ptr, size_t size)
+{
+ if (ptr)
+ {
+#ifdef _WIN32
+ if (!VirtualFree(ptr, 0, MEM_RELEASE))
+ PanicAlert("FreeMemoryPages failed!\nVirtualFree: %s", GetLastErrorString().c_str());
+#else
+ if (munmap(ptr, size) != 0)
+ PanicAlert("FreeMemoryPages failed!\nmunmap: %s", LastStrerrorString().c_str());
+#endif
+ }
+}
+
+void FreeAlignedMemory(void* ptr)
+{
+ if (ptr)
+ {
+#ifdef _WIN32
+ _aligned_free(ptr);
+#else
+ free(ptr);
+#endif
+ }
+}
+
+void ReadProtectMemory(void* ptr, size_t size)
+{
+#ifdef _WIN32
+ DWORD oldValue;
+ if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue))
+ PanicAlert("ReadProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+ if
(mprotect(ptr, size, PROT_NONE) != 0)
+ PanicAlert("ReadProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+#endif
+}
+
+void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
+{
+#ifdef _WIN32
+ DWORD oldValue;
+ if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, &oldValue))
+ PanicAlert("WriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+ if (mprotect(ptr, size, allowExecute ? (PROT_READ | PROT_EXEC) : PROT_READ) != 0)
+ PanicAlert("WriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+#endif
+}
+
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)
+{
+#ifdef _WIN32
+ DWORD oldValue;
+ if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldValue))
+ PanicAlert("UnWriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+ if (mprotect(ptr, size,
+ allowExecute ? (PROT_READ | PROT_WRITE | PROT_EXEC) : PROT_WRITE | PROT_READ) != 0)
+ {
+ PanicAlert("UnWriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+ }
+#endif
+}
+
+size_t MemPhysical()
+{
+#ifdef _WIN32
+ MEMORYSTATUSEX memInfo;
+ memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+ GlobalMemoryStatusEx(&memInfo);
+ return memInfo.ullTotalPhys;
+#elif defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
+ int mib[2];
+ size_t physical_memory;
+ mib[0] = CTL_HW;
+#ifdef __APPLE__
+ mib[1] = HW_MEMSIZE;
+#elif defined __FreeBSD__
+ mib[1] = HW_REALMEM;
+#elif defined __OpenBSD__
+ mib[1] = HW_PHYSMEM;
+#endif
+ size_t length = sizeof(size_t);
+ sysctl(mib, 2, &physical_memory, &length, NULL, 0);
+ return physical_memory;
+#elif defined __HAIKU__
+ system_info sysinfo;
+ get_system_info(&sysinfo);
+ return static_cast<size_t>(sysinfo.max_pages * B_PAGE_SIZE);
+#else
+ struct sysinfo memInfo;
+ sysinfo(&memInfo);
+ return (size_t)memInfo.totalram * memInfo.mem_unit;
+#endif
+}
+
+} // namespace Common
diff --git a/src/dolphin/MemoryUtil.h b/src/dolphin/MemoryUtil.h
new file mode 100644
index 0000000..607b7a8
--- /dev/null
+++ b/src/dolphin/MemoryUtil.h
@@ -0,0 +1,22 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+
+namespace Common
+{
+void* AllocateExecutableMemory(size_t size);
+void* AllocateMemoryPages(size_t size);
+void FreeMemoryPages(void* ptr, size_t size);
+void* AllocateAlignedMemory(size_t size, size_t alignment);
+void FreeAlignedMemory(void* ptr);
+void ReadProtectMemory(void* ptr, size_t size);
+void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false);
+size_t MemPhysical();
+
+} // namespace Common
diff --git a/src/dolphin/license_dolphin.txt b/src/dolphin/license_dolphin.txt
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/src/dolphin/license_dolphin.txt
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it.
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). 
+Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/src/dolphin/x64ABI.cpp b/src/dolphin/x64ABI.cpp new file mode 100644 index 0000000..d86a158 --- /dev/null +++ b/src/dolphin/x64ABI.cpp @@ -0,0 +1,119 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include "../types.h" +#include "x64ABI.h" +#include "x64Emitter.h" + +using namespace Gen; + +// Shared code between Win64 and Unix64 + +void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) +{ + size_t shadow = 0; +#if defined(_WIN32) + shadow = 0x20; +#endif + + int count = (mask & ABI_ALL_GPRS).Count(); + rsp_alignment -= count * 8; + size_t subtraction = 0; + int fpr_count = (mask & ABI_ALL_FPRS).Count(); + if (fpr_count) + { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xf; + } + subtraction += 16 * fpr_count; + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += shadow; + // Final alignment. + rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xf; + + *shadowp = shadow; + *subtractionp = subtraction; + *xmm_offsetp = subtraction - xmm_base_subtraction; +} + +size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int r : mask& ABI_ALL_GPRS) + PUSH((X64Reg)r); + + if (subtraction) + SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16)); + xmm_offset += 16; + } + + return shadow; +} + +void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD((X64Reg)(x - 16), MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; + } + + if (subtraction) + ADD(64, R(RSP), subtraction >= 0x80 ? 
Imm32((u32)subtraction) : Imm8((u8)subtraction));
+
+ for (int r = 15; r >= 0; r--)
+ {
+ if (mask[r])
+ POP((X64Reg)r);
+ }
+}
+
+void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2,
+ Gen::X64Reg src2)
+{
+ if (dst1 == src2 && dst2 == src1)
+ {
+ XCHG(bits, R(src1), R(src2));
+ if (offset1)
+ ADD(bits, R(dst1), Imm32(offset1));
+ }
+ else if (src2 != dst1)
+ {
+ if (dst1 != src1 && offset1)
+ LEA(bits, dst1, MDisp(src1, offset1));
+ else if (dst1 != src1)
+ MOV(bits, R(dst1), R(src1));
+ else if (offset1)
+ ADD(bits, R(dst1), Imm32(offset1));
+ if (dst2 != src2)
+ MOV(bits, R(dst2), R(src2));
+ }
+ else
+ {
+ if (dst2 != src2)
+ MOV(bits, R(dst2), R(src2));
+ if (dst1 != src1 && offset1)
+ LEA(bits, dst1, MDisp(src1, offset1));
+ else if (dst1 != src1)
+ MOV(bits, R(dst1), R(src1));
+ else if (offset1)
+ ADD(bits, R(dst1), Imm32(offset1));
+ }
+}
diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h
new file mode 100644
index 0000000..997782e
--- /dev/null
+++ b/src/dolphin/x64ABI.h
@@ -0,0 +1,57 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include "BitSet.h"
+#include "x64Reg.h"
+
+// x64 ABI:s, and helpers to help follow them when JIT-ing code.
+// All conventions return values in EAX (+ possibly EDX).
+
+// Windows 64-bit
+// * 4-reg "fastcall" variant, very new-skool stack handling
+// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself
+// calls_
+// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space.
+// Scratch: RAX RCX RDX R8 R9 R10 R11
+// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15
+// Parameters: RCX RDX R8 R9, further MOV-ed
+
+// Linux 64-bit
+// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed)
+// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11
+// Callee-save: RBX RBP R12 R13 R14 R15
+// Parameters: RDI RSI RDX RCX R8 R9
+
+#define ABI_ALL_FPRS BitSet32(0xffff0000)
+#define ABI_ALL_GPRS BitSet32(0x0000ffff)
+
+#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
+
+#define ABI_PARAM1 RCX
+#define ABI_PARAM2 RDX
+#define ABI_PARAM3 R8
+#define ABI_PARAM4 R9
+
+// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
+#define ABI_ALL_CALLER_SAVED \
+ (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11})
+#else // 64-bit Unix / OS X
+
+#define ABI_PARAM1 RDI
+#define ABI_PARAM2 RSI
+#define ABI_PARAM3 RDX
+#define ABI_PARAM4 RCX
+#define ABI_PARAM5 R8
+#define ABI_PARAM6 R9
+
+// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
+// don't actually clobber them.
+#define ABI_ALL_CALLER_SAVED (BitSet32{RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11} | ABI_ALL_FPRS)
+#endif // WIN32
+
+#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
+
+#define ABI_RETURN RAX
diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp
new file mode 100644
index 0000000..05ee11c
--- /dev/null
+++ b/src/dolphin/x64CPUDetect.cpp
@@ -0,0 +1,274 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
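+
+// The flags detected below land in the global cpu_info object from
+// CPUDetect.h. A consumer-side sketch (the SSE4.2 requirement is purely
+// illustrative; PanicAlert comes from Log.h in this import):
+//
+//   if (!cpu_info.bSSE4_2)
+//     PanicAlert("CPU lacks SSE4.2");
+//   printf("%s\n", cpu_info.Summarize().c_str());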
+
+#include <cstring>
+#include <string>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "Intrinsics.h"
+
+#ifndef _MSC_VER
+
+#ifdef __FreeBSD__
+#include <unistd.h>
+
+#include <machine/cpufunc.h>
+#include <sys/types.h>
+#endif
+
+static inline void __cpuidex(int info[4], int function_id, int subfunction_id)
+{
+#ifdef __FreeBSD__
+ // Despite the name, this is just do_cpuid() with ECX as second input.
+ cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
+#else
+ info[0] = function_id; // eax
+ info[2] = subfunction_id; // ecx
+ __asm__("cpuid"
+ : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+ : "a"(function_id), "c"(subfunction_id));
+#endif
+}
+
+static inline void __cpuid(int info[4], int function_id)
+{
+ return __cpuidex(info, function_id, 0);
+}
+
+#endif // ifndef _MSC_VER
+
+#ifdef _MSC_VER
+
+static u64 xgetbv(u32 index)
+{
+ return _xgetbv(index);
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = _XCR_XFEATURE_ENABLED_MASK;
+
+#else
+
+static u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+ __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+ return ((u64)edx << 32) | eax;
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = 0;
+#endif // ifdef _MSC_VER
+
+CPUInfo cpu_info;
+
+CPUInfo::CPUInfo()
+{
+ Detect();
+}
+
+// Detects the various CPU features
+void CPUInfo::Detect()
+{
+#ifdef _M_X86_64
+ Mode64bit = true;
+ OS64bit = true;
+#endif
+ num_cores = 1;
+
+ // Set obvious defaults, for extra safety
+ if (Mode64bit)
+ {
+ bSSE = true;
+ bSSE2 = true;
+ bLongMode = true;
+ }
+
+ // Assume CPU supports the CPUID instruction. Those that don't can barely
+ // boot modern OS:es anyway.
+ int cpu_id[4];
+
+ // Detect CPU's CPUID capabilities, and grab CPU string
+ __cpuid(cpu_id, 0x00000000);
+ u32 max_std_fn = cpu_id[0]; // EAX
+ std::memcpy(&brand_string[0], &cpu_id[1], sizeof(int));
+ std::memcpy(&brand_string[4], &cpu_id[3], sizeof(int));
+ std::memcpy(&brand_string[8], &cpu_id[2], sizeof(int));
+ __cpuid(cpu_id, 0x80000000);
+ u32 max_ex_fn = cpu_id[0];
+ if (!strcmp(brand_string, "GenuineIntel"))
+ vendor = CPUVendor::Intel;
+ else if (!strcmp(brand_string, "AuthenticAMD"))
+ vendor = CPUVendor::AMD;
+ else
+ vendor = CPUVendor::Other;
+
+ // Set reasonable default brand string even if brand string not available.
+ strcpy(cpu_string, brand_string);
+
+ // Detect family and other misc stuff.
+ bool ht = false;
+ HTT = ht;
+ logical_cpu_count = 1;
+ if (max_std_fn >= 1)
+ {
+ __cpuid(cpu_id, 0x00000001);
+ int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+ int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+ // Detect people unfortunate enough to be running Dolphin on an Atom
+ if (family == 6 &&
+ (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 ||
+ model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+ bAtom = true;
+ logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
+ ht = (cpu_id[3] >> 28) & 1;
+
+ if ((cpu_id[3] >> 25) & 1)
+ bSSE = true;
+ if ((cpu_id[3] >> 26) & 1)
+ bSSE2 = true;
+ if ((cpu_id[2]) & 1)
+ bSSE3 = true;
+ if ((cpu_id[2] >> 9) & 1)
+ bSSSE3 = true;
+ if ((cpu_id[2] >> 19) & 1)
+ bSSE4_1 = true;
+ if ((cpu_id[2] >> 20) & 1)
+ bSSE4_2 = true;
+ if ((cpu_id[2] >> 22) & 1)
+ bMOVBE = true;
+ if ((cpu_id[2] >> 25) & 1)
+ bAES = true;
+
+ if ((cpu_id[3] >> 24) & 1)
+ {
+ // We can use FXSAVE.
+ bFXSR = true;
+ }
+
+ // AVX support requires 3 separate checks:
+ // - Is the AVX bit set in CPUID?
+ // - Is the XSAVE bit set in CPUID?
+ // - XGETBV result has the XCR bit set.
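+ // Concretely, mirroring the code below: CPUID leaf 1 ECX bit 28 is AVX,
+ // bit 27 is OSXSAVE, and xgetbv(XCR0) must report XMM and YMM state enabled
+ // by the OS (bits 1 and 2, hence the 0x6 mask).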
+ if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1))
+ {
+ if ((xgetbv(XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)
+ {
+ bAVX = true;
+ if ((cpu_id[2] >> 12) & 1)
+ bFMA = true;
+ }
+ }
+
+ if (max_std_fn >= 7)
+ {
+ __cpuidex(cpu_id, 0x00000007, 0x00000000);
+ // careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed
+ if ((cpu_id[1] >> 5) & 1)
+ bAVX2 = bAVX;
+ if ((cpu_id[1] >> 3) & 1)
+ bBMI1 = true;
+ if ((cpu_id[1] >> 8) & 1)
+ bBMI2 = true;
+ }
+ }
+
+ bFlushToZero = bSSE;
+
+ if (max_ex_fn >= 0x80000004)
+ {
+ // Extract CPU model string
+ __cpuid(cpu_id, 0x80000002);
+ memcpy(cpu_string, cpu_id, sizeof(cpu_id));
+ __cpuid(cpu_id, 0x80000003);
+ memcpy(cpu_string + 16, cpu_id, sizeof(cpu_id));
+ __cpuid(cpu_id, 0x80000004);
+ memcpy(cpu_string + 32, cpu_id, sizeof(cpu_id));
+ }
+ if (max_ex_fn >= 0x80000001)
+ {
+ // Check for more features.
+ __cpuid(cpu_id, 0x80000001);
+ if (cpu_id[2] & 1)
+ bLAHFSAHF64 = true;
+ if ((cpu_id[2] >> 5) & 1)
+ bLZCNT = true;
+ if ((cpu_id[2] >> 16) & 1)
+ bFMA4 = true;
+ if ((cpu_id[3] >> 29) & 1)
+ bLongMode = true;
+ }
+
+ num_cores = (logical_cpu_count == 0) ? 1 : logical_cpu_count;
+
+ if (max_ex_fn >= 0x80000008)
+ {
+ // Get number of cores. This is a bit complicated. Following AMD manual here.
+ __cpuid(cpu_id, 0x80000008);
+ int apic_id_core_id_size = (cpu_id[2] >> 12) & 0xF;
+ if (apic_id_core_id_size == 0)
+ {
+ if (ht)
+ {
+ // New mechanism for modern Intel CPUs.
+ if (vendor == CPUVendor::Intel)
+ {
+ __cpuidex(cpu_id, 0x00000004, 0x00000000);
+ int cores_x_package = ((cpu_id[0] >> 26) & 0x3F) + 1;
+ HTT = (cores_x_package < logical_cpu_count);
+ cores_x_package = ((logical_cpu_count % cores_x_package) == 0) ? cores_x_package : 1;
+ num_cores = (cores_x_package > 1) ? cores_x_package : num_cores;
+ logical_cpu_count /= cores_x_package;
+ }
+ }
+ }
+ else
+ {
+ // Use AMD's new method.
+ num_cores = (cpu_id[2] & 0xFF) + 1;
+ }
+ }
+}
+
+// Turn the CPU info into a string we can show
+std::string CPUInfo::Summarize()
+{
+ std::string sum(cpu_string);
+ sum += " (";
+ sum += brand_string;
+ sum += ")";
+
+ if (bSSE)
+ sum += ", SSE";
+ if (bSSE2)
+ {
+ sum += ", SSE2";
+ if (!bFlushToZero)
+ sum += " (but not DAZ!)";
+ }
+ if (bSSE3)
+ sum += ", SSE3";
+ if (bSSSE3)
+ sum += ", SSSE3";
+ if (bSSE4_1)
+ sum += ", SSE4.1";
+ if (bSSE4_2)
+ sum += ", SSE4.2";
+ if (HTT)
+ sum += ", HTT";
+ if (bAVX)
+ sum += ", AVX";
+ if (bAVX2)
+ sum += ", AVX2";
+ if (bBMI1)
+ sum += ", BMI1";
+ if (bBMI2)
+ sum += ", BMI2";
+ if (bFMA)
+ sum += ", FMA";
+ if (bAES)
+ sum += ", AES";
+ if (bMOVBE)
+ sum += ", MOVBE";
+ if (bLongMode)
+ sum += ", 64-bit support";
+ return sum;
+}
diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp
new file mode 100644
index 0000000..7849624
--- /dev/null
+++ b/src/dolphin/x64Emitter.cpp
@@ -0,0 +1,3398 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cinttypes>
+#include <cstring>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "Log.h"
+#include "x64Emitter.h"
+#include "x64Reg.h"
+
+namespace Gen
+{
+// TODO(ector): Add EAX special casing, for ever so slightly smaller code.
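+
+// Reading the NormalOpDef table below, with the ADD row as the example:
+// 0x00/0x01 are the "r/m, reg" opcodes (8-bit/wider), 0x02/0x03 the
+// "reg, r/m" forms, 0x80/0x81 the imm8/imm32 group encodings, 0x83 the
+// sign-extended imm8 form, 0x04/0x05 the short AL/EAX-immediate forms, and
+// ext is the /digit placed in the ModRM reg field to select the operation
+// within the shared 0x80..0x83 group.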
+struct NormalOpDef
+{
+ u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
+};
+
+// 0xCC is code for invalid combination of immediates
+static const NormalOpDef normalops[11] = {
+ {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, // ADD
+ {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, // ADC
+
+ {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, // SUB
+ {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, // SBB
+
+ {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, // AND
+ {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, // OR
+
+ {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, // XOR
+ {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, // MOV
+
+ {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, // TEST (to == from)
+ {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, // CMP
+
+ {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, // XCHG
+};
+
+enum NormalSSEOps
+{
+ sseCMP = 0xC2,
+ sseADD = 0x58, // ADD
+ sseSUB = 0x5C, // SUB
+ sseAND = 0x54, // AND
+ sseANDN = 0x55, // ANDN
+ sseOR = 0x56,
+ sseXOR = 0x57,
+ sseMUL = 0x59, // MUL
+ sseDIV = 0x5E, // DIV
+ sseMIN = 0x5D, // MIN
+ sseMAX = 0x5F, // MAX
+ sseCOMIS = 0x2F, // COMIS
+ sseUCOMIS = 0x2E, // UCOMIS
+ sseSQRT = 0x51, // SQRT
+ sseRCP = 0x53, // RCP
+ sseRSQRT = 0x52, // RSQRT (NO DOUBLE PRECISION!!!)
+ sseMOVAPfromRM = 0x28, // MOVAP from RM
+ sseMOVAPtoRM = 0x29, // MOVAP to RM
+ sseMOVUPfromRM = 0x10, // MOVUP from RM
+ sseMOVUPtoRM = 0x11, // MOVUP to RM
+ sseMOVLPfromRM = 0x12,
+ sseMOVLPtoRM = 0x13,
+ sseMOVHPfromRM = 0x16,
+ sseMOVHPtoRM = 0x17,
+ sseMOVHLPS = 0x12,
+ sseMOVLHPS = 0x16,
+ sseMOVDQfromRM = 0x6F,
+ sseMOVDQtoRM = 0x7F,
+ sseMASKMOVDQU = 0xF7,
+ sseLDDQU = 0xF0,
+ sseSHUF = 0xC6,
+ sseMOVNTDQ = 0xE7,
+ sseMOVNTP = 0x2B,
+};
+
+enum class NormalOp
+{
+ ADD,
+ ADC,
+ SUB,
+ SBB,
+ AND,
+ OR,
+ XOR,
+ MOV,
+ TEST,
+ CMP,
+ XCHG,
+};
+
+enum class FloatOp
+{
+ LD = 0,
+ ST = 2,
+ STP = 3,
+ LD80 = 5,
+ STP80 = 7,
+
+ Invalid = -1,
+};
+
+void XEmitter::SetCodePtr(u8* ptr)
+{
+ code = ptr;
+}
+
+const u8* XEmitter::GetCodePtr() const
+{
+ return code;
+}
+
+u8* XEmitter::GetWritableCodePtr()
+{
+ return code;
+}
+
+void XEmitter::Write8(u8 value)
+{
+ *code++ = value;
+}
+
+void XEmitter::Write16(u16 value)
+{
+ std::memcpy(code, &value, sizeof(u16));
+ code += sizeof(u16);
+}
+
+void XEmitter::Write32(u32 value)
+{
+ std::memcpy(code, &value, sizeof(u32));
+ code += sizeof(u32);
+}
+
+void XEmitter::Write64(u64 value)
+{
+ std::memcpy(code, &value, sizeof(u64));
+ code += sizeof(u64);
+}
+
+void XEmitter::ReserveCodeSpace(int bytes)
+{
+ for (int i = 0; i < bytes; i++)
+ *code++ = 0xCC;
+}
+
+u8* XEmitter::AlignCodeTo(size_t alignment)
+{
+ ASSERT_MSG(DYNA_REC, alignment != 0 && (alignment & (alignment - 1)) == 0,
+ "Alignment must be power of two");
+ u64 c = reinterpret_cast<u64>(code) & (alignment - 1);
+ if (c)
+ ReserveCodeSpace(static_cast<int>(alignment - c));
+ return code;
+}
+
+u8* XEmitter::AlignCode4()
+{
+ return AlignCodeTo(4);
+}
+
+u8* XEmitter::AlignCode16()
+{
+ return AlignCodeTo(16);
+}
+
+u8* XEmitter::AlignCodePage()
+{
+ return AlignCodeTo(4096);
+}
+
+// This operation modifies flags; check to see the flags are locked.
+// If the flags are locked, we should immediately and loudly fail before
+// causing a subtle JIT bug.
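+// A sketch of how the lock is meant to be used (assuming the matching
+// LockFlags()/UnlockFlags() helpers that set flags_locked in x64Emitter.h):
+//
+//   LockFlags();    // e.g. a host carry flag was just set up and must survive
+//   ...emit only flag-preserving code; a flag-writing emit now asserts...
+//   UnlockFlags();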
+void XEmitter::CheckFlags()
+{
+ ASSERT_MSG(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!");
+}
+
+void XEmitter::WriteModRM(int mod, int reg, int rm)
+{
+ Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7)));
+}
+
+void XEmitter::WriteSIB(int scale, int index, int base)
+{
+ Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7)));
+}
+
+void OpArg::WriteREX(XEmitter* emit, int opBits, int bits, int customOp) const
+{
+ if (customOp == -1)
+ customOp = operandReg;
+ u8 op = 0x40;
+ // REX.W (whether operation is a 64-bit operation)
+ if (opBits == 64)
+ op |= 8;
+ // REX.R (whether ModR/M reg field refers to R8-R15)
+ if (customOp & 8)
+ op |= 4;
+ // REX.X (whether ModR/M SIB index field refers to R8-R15)
+ if (indexReg & 8)
+ op |= 2;
+ // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15)
+ if (offsetOrBaseReg & 8)
+ op |= 1;
+ // Write REX if we have REX bits to write, or if the operation accesses
+ // SIL, DIL, BPL, or SPL.
+ if (op != 0x40 || (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
+ (opBits == 8 && (customOp & 0x10c) == 4))
+ {
+ emit->Write8(op);
+ // Check the operation doesn't access AH, BH, CH, or DH.
+ DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0);
+ DEBUG_ASSERT((customOp & 0x100) == 0);
+ }
+}
+
+void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+ int W) const
+{
+ int R = !(regOp1 & 8);
+ int X = !(indexReg & 8);
+ int B = !(offsetOrBaseReg & 8);
+
+ int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
+
+ // do we need any VEX fields that only appear in the three-byte form?
+ if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
+ {
+ u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp;
+ emit->Write8(0xC5);
+ emit->Write8(RvvvvLpp);
+ }
+ else
+ {
+ u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
+ u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp;
+ emit->Write8(0xC4);
+ emit->Write8(RXBmmmmm);
+ emit->Write8(WvvvvLpp);
+ }
+}
+
+void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg,
+ bool warn_64bit_offset) const
+{
+ if (_operandReg == INVALID_REG)
+ _operandReg = (X64Reg)this->operandReg;
+ int mod = 0;
+ int ireg = indexReg;
+ bool SIB = false;
+ int _offsetOrBaseReg = this->offsetOrBaseReg;
+
+ if (scale == SCALE_RIP) // Also, on 32-bit, just an immediate address
+ {
+ // Oh, RIP addressing.
+ _offsetOrBaseReg = 5;
+ emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
+ // TODO : add some checks
+ u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
+ s64 distance = (s64)offset - (s64)ripAddr;
+ ASSERT_MSG(DYNA_REC,
+ (distance < 0x80000000LL && distance >= -0x80000000LL) || !warn_64bit_offset,
+ "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", ripAddr, offset);
+ s32 offs = (s32)distance;
+ emit->Write32((u32)offs);
+ return;
+ }
+
+ if (scale == 0)
+ {
+ // Oh, no memory. Just a reg.
+ mod = 3; // 11
+ }
+ else
+ {
+ // Ah good, no scaling.
+ if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5))
+ {
+ // Okay, we're good. No SIB necessary.
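+ // e.g. MDisp(RCX, 0) needs no displacement byte (mod = 0), MDisp(RCX, 4)
+ // takes a disp8 (mod = 1), and MDisp(RCX, 0x1000) a disp32 (mod = 2).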
+ int ioff = (int)offset; + if (ioff == 0) + { + mod = 0; + } + else if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) + { + SIB = true; + mod = 0; + _offsetOrBaseReg = 5; + } + else + { + if ((_offsetOrBaseReg & 7) == 4) // this would occupy the SIB encoding :( + { + // So we have to fake it with SIB encoding :( + SIB = true; + } + + if (scale >= SCALE_1 && scale < SCALE_ATREG) + { + SIB = true; + } + + if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) + { + SIB = true; + ireg = _offsetOrBaseReg; + } + + // Okay, we're fine. Just disp encoding. + // We need displacement. Which size? + int ioff = (int)(s64)offset; + if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + } + + // Okay. Time to do the actual writing + // ModRM byte: + int oreg = _offsetOrBaseReg; + if (SIB) + oreg = 4; + + emit->WriteModRM(mod, _operandReg & 7, oreg & 7); + + if (SIB) + { + // SIB byte + int ss; + switch (scale) + { + case SCALE_NONE: + _offsetOrBaseReg = 4; + ss = 0; + break; // RSP + case SCALE_1: + ss = 0; + break; + case SCALE_2: + ss = 1; + break; + case SCALE_4: + ss = 2; + break; + case SCALE_8: + ss = 3; + break; + case SCALE_NOBASE_2: + ss = 1; + break; + case SCALE_NOBASE_4: + ss = 2; + break; + case SCALE_NOBASE_8: + ss = 3; + break; + case SCALE_ATREG: + ss = 0; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "Invalid scale for SIB byte"); + ss = 0; + break; + } + emit->Write8((u8)((ss << 6) | ((ireg & 7) << 3) | (_offsetOrBaseReg & 7))); + } + + if (mod == 1) // 8-bit disp + { + emit->Write8((u8)(s8)(s32)offset); + } + else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) // 32-bit disp + { + emit->Write32((u32)offset); + } +} + +// W = operand extended width (1 if 64-bit) +// R = register# upper bit +// X = scale amnt upper bit +// B = base register# upper bit +void XEmitter::Rex(int w, int r, int x, int b) +{ + w = w ? 1 : 0; + r = r ? 1 : 0; + x = x ? 1 : 0; + b = b ? 
1 : 0; + u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); + if (rx != 0x40) + Write8(rx); +} + +void XEmitter::JMP(const u8* addr, bool force5Bytes) +{ + u64 fn = (u64)addr; + if (!force5Bytes) + { + s64 distance = (s64)(fn - ((u64)code + 2)); + ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + // 8 bits will do + Write8(0xEB); + Write8((u8)(s8)distance); + } + else + { + s64 distance = (s64)(fn - ((u64)code + 5)); + + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0xE9); + Write32((u32)(s32)distance); + } +} + +void XEmitter::JMPptr(const OpArg& arg2) +{ + OpArg arg = arg2; + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "JMPptr - Imm argument"); + arg.operandReg = 4; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +// Can be used to trap other processors, before overwriting their code +// not used in Dolphin +void XEmitter::JMPself() +{ + Write8(0xEB); + Write8(0xFE); +} + +void XEmitter::CALLptr(OpArg arg) +{ + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "CALLptr - Imm argument"); + arg.operandReg = 2; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +void XEmitter::CALL(const void* fnptr) +{ + u64 distance = u64(fnptr) - (u64(code) + 5); + ASSERT_MSG(DYNA_REC, distance < 0x0000000080000000ULL || distance >= 0xFFFFFFFF80000000ULL, + "CALL out of range (%p calls %p)", code, fnptr); + Write8(0xE8); + Write32(u32(distance)); +} + +FixupBranch XEmitter::CALL() +{ + FixupBranch branch; + branch.type = FixupBranch::Type::Branch32Bit; + branch.ptr = code + 5; + Write8(0xE8); + Write32(0); + return branch; +} + +FixupBranch XEmitter::J(bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 5 : 2); + if (!force5bytes) + { + // 8 bits will do + Write8(0xEB); + Write8(0); + } + else + { + Write8(0xE9); + Write32(0); + } + return branch; +} + +FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 
6 : 2);
+ if (!force5bytes)
+ {
+ // 8 bits will do
+ Write8(0x70 + conditionCode);
+ Write8(0);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0x80 + conditionCode);
+ Write32(0);
+ }
+ return branch;
+}
+
+void XEmitter::J_CC(CCFlags conditionCode, const u8* addr)
+{
+ u64 fn = (u64)addr;
+ s64 distance = (s64)(fn - ((u64)code + 2));
+ if (distance < -0x80 || distance >= 0x80)
+ {
+ distance = (s64)(fn - ((u64)code + 6));
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+ Write8(0x0F);
+ Write8(0x80 + conditionCode);
+ Write32((u32)(s32)distance);
+ }
+ else
+ {
+ Write8(0x70 + conditionCode);
+ Write8((u8)(s8)distance);
+ }
+}
+
+void XEmitter::SetJumpTarget(const FixupBranch& branch)
+{
+ if (branch.type == FixupBranch::Type::Branch8Bit)
+ {
+ s64 distance = (s64)(code - branch.ptr);
+ if (!(distance >= -0x80 && distance < 0x80))
+ {
+ printf("miauz\n");
+ }
+ ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80,
+ "Jump target too far away, needs force5Bytes = true");
+ branch.ptr[-1] = (u8)(s8)distance;
+ }
+ else if (branch.type == FixupBranch::Type::Branch32Bit)
+ {
+ s64 distance = (s64)(code - branch.ptr);
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+
+ s32 valid_distance = static_cast<s32>(distance);
+ std::memcpy(&branch.ptr[-4], &valid_distance, sizeof(s32));
+ }
+}
+
+// Single byte opcodes
+// There is no PUSHAD/POPAD in 64-bit mode.
+void XEmitter::INT3()
+{
+ Write8(0xCC);
+}
+void XEmitter::RET()
+{
+ Write8(0xC3);
+}
+void XEmitter::RET_FAST()
+{
+ Write8(0xF3);
+ Write8(0xC3);
+} // two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to
+ // a ret
+
+// The first sign of decadence: optimized NOPs.
+void XEmitter::NOP(size_t size)
+{
+ DEBUG_ASSERT((int)size > 0);
+ while (true)
+ {
+ switch (size)
+ {
+ case 0:
+ return;
+ case 1:
+ Write8(0x90);
+ return;
+ case 2:
+ Write8(0x66);
+ Write8(0x90);
+ return;
+ case 3:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x00);
+ return;
+ case 4:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x40);
+ Write8(0x00);
+ return;
+ case 5:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x44);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 6:
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x44);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 7:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x80);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 8:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 9:
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 10:
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ default:
+ // Even though x86 instructions are allowed to be up to 15 bytes long,
+ // AMD advises against using NOPs longer than 11 bytes because they
+ // carry a performance penalty on CPUs older than AMD family 16h.
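+ // Longer requests are therefore emitted as a chain of NOPs of at most 11
+ // bytes each, e.g. NOP(25) produces 11 + 11 + 3 bytes through this loop.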
+ Write8(0x66); + Write8(0x66); + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + size -= 11; + continue; + } + } +} + +void XEmitter::PAUSE() +{ + Write8(0xF3); + NOP(); +} // use in tight spinloops for energy saving on some CPU +void XEmitter::CLC() +{ + CheckFlags(); + Write8(0xF8); +} // clear carry +void XEmitter::CMC() +{ + CheckFlags(); + Write8(0xF5); +} // flip carry +void XEmitter::STC() +{ + CheckFlags(); + Write8(0xF9); +} // set carry + +// TODO: xchg ah, al ??? +void XEmitter::XCHG_AHAL() +{ + Write8(0x86); + Write8(0xe0); + // alt. 86 c4 +} + +// These two can not be executed on early Intel 64-bit CPU:s, only on AMD! +void XEmitter::LAHF() +{ + Write8(0x9F); +} +void XEmitter::SAHF() +{ + CheckFlags(); + Write8(0x9E); +} + +void XEmitter::PUSHF() +{ + Write8(0x9C); +} +void XEmitter::POPF() +{ + CheckFlags(); + Write8(0x9D); +} + +void XEmitter::LFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xE8); +} +void XEmitter::MFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xF0); +} +void XEmitter::SFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xF8); +} + +void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte + ((int)reg & 7)); +} + +void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte1); + Write8(byte2 + ((int)reg & 7)); +} + +void XEmitter::CWD(int bits) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, 0); + Write8(0x99); +} + +void XEmitter::CBW(int bits) +{ + if (bits == 8) + Write8(0x66); + Rex(bits == 32, 0, 0, 0); + Write8(0x98); +} + +// Simple opcodes + +// push/pop do not need wide to be 64-bit +void XEmitter::PUSH(X64Reg reg) +{ + WriteSimple1Byte(32, 0x50, reg); +} +void XEmitter::POP(X64Reg reg) +{ + WriteSimple1Byte(32, 0x58, reg); +} + +void XEmitter::PUSH(int bits, const OpArg& reg) +{ + if (reg.IsSimpleReg()) + PUSH(reg.GetSimpleReg()); + else if (reg.IsImm()) + { + switch (reg.GetImmBits()) + { + case 8: + Write8(0x6A); + Write8((u8)(s8)reg.offset); + break; + case 16: + Write8(0x66); + Write8(0x68); + Write16((u16)(s16)(s32)reg.offset); + break; + case 32: + Write8(0x68); + Write32((u32)reg.offset); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "PUSH - Bad imm bits"); + break; + } + } + else + { + if (bits == 16) + Write8(0x66); + reg.WriteREX(this, bits, bits); + Write8(0xFF); + reg.WriteRest(this, 0, (X64Reg)6); + } +} + +void XEmitter::POP(int /*bits*/, const OpArg& reg) +{ + if (reg.IsSimpleReg()) + POP(reg.GetSimpleReg()); + else + ASSERT_MSG(DYNA_REC, 0, "POP - Unsupported encoding"); +} + +void XEmitter::BSWAP(int bits, X64Reg reg) +{ + if (bits >= 32) + { + WriteSimple2Byte(bits, 0x0F, 0xC8, reg); + } + else if (bits == 16) + { + ROL(16, R(reg), Imm8(8)); + } + else if (bits == 8) + { + // Do nothing - can't bswap a single byte... + } + else + { + ASSERT_MSG(DYNA_REC, 0, "BSWAP - Wrong number of bits"); + } +} + +// Undefined opcode - reserved +// If we ever need a way to always cause a non-breakpoint hard exception... 
+void XEmitter::UD2() +{ + Write8(0x0F); + Write8(0x0B); +} + +void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) +{ + ASSERT_MSG(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument"); + arg.operandReg = (u8)level; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0x18); + arg.WriteRest(this); +} + +void XEmitter::SETcc(CCFlags flag, OpArg dest) +{ + ASSERT_MSG(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument"); + dest.operandReg = 0; + dest.WriteREX(this, 0, 8); + Write8(0x0F); + Write8(0x90 + (u8)flag); + dest.WriteRest(this); +} + +void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument"); + ASSERT_MSG(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported"); + if (bits == 16) + Write8(0x66); + src.operandReg = dest; + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0x40 + (u8)flag); + src.WriteRest(this); +} + +void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument"); + CheckFlags(); + src.operandReg = ext; + if (bits == 16) + Write8(0x66); + src.WriteREX(this, bits, bits, 0); + if (bits == 8) + { + Write8(0xF6); + } + else + { + Write8(0xF7); + } + src.WriteRest(this); +} + +void XEmitter::MUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 4); +} +void XEmitter::DIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 6); +} +void XEmitter::IMUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 5); +} +void XEmitter::IDIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 7); +} +void XEmitter::NEG(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 3); +} +void XEmitter::NOT(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 2); +} + +void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument"); + CheckFlags(); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); + if (rep) + Write8(0xF3); + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(byte2); + src.WriteRest(this); +} + +void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src) +{ + if (bits <= 16) + ASSERT_MSG(DYNA_REC, 0, "MOVNTI - bits<=16"); + WriteBitSearchType(bits, src, dest, 0xC3); +} + +void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBC); +} // Bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBD); +} // Top bit to bottom bit + +void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBC, true); +} +void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bLZCNT) + PanicAlert("Trying to use LZCNT on a system that doesn't support it. 
Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBD, true); +} + +void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + src.WriteREX(this, dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xBE); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xBF); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x63); + } + else + { + Crash(); + } + src.WriteRest(this); +} + +void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + // the 32bit result is automatically zero extended to 64bit + src.WriteREX(this, dbits == 64 ? 32 : dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xB6); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xB7); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x8B); + } + else + { + ASSERT_MSG(DYNA_REC, 0, "MOVZX - Invalid size"); + } + src.WriteRest(this); +} + +void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg) +{ + ASSERT_MSG(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it."); + if (bits == 8) + { + MOV(8, op & 1 ? arg : R(reg), op & 1 ? R(reg) : arg); + return; + } + if (bits == 16) + Write8(0x66); + ASSERT_MSG(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!"); + arg.WriteREX(this, bits, bits, reg); + Write8(0x0F); + Write8(0x38); + Write8(op); + arg.WriteRest(this, 0, reg); +} +void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src) +{ + WriteMOVBE(bits, 0xF0, dest, src); +} +void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src) +{ + WriteMOVBE(bits, 0xF1, src, dest); +} + +void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info) +{ + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + + switch (size) + { + case 8: + if (sign_extend) + MOVSX(32, 8, dst, src); + else + MOVZX(32, 8, dst, src); + break; + case 16: + MOVZX(32, 16, dst, src); + if (sign_extend) + { + BSWAP(32, dst); + SAR(32, R(dst), Imm8(16)); + } + else + { + ROL(16, R(dst), Imm8(8)); + } + break; + case 32: + case 64: + if (cpu_info.bMOVBE) + { + MOVBE(size, dst, src); + } + else + { + MOV(size, R(dst), src); + BSWAP(size, dst); + } + break; + } +} + +void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info) +{ + if (cpu_info.bMOVBE) + { + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + MOVBE(size, dst, src); + } + else + { + BSWAP(size, src); + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = true; + info->nonAtomicSwapStoreSrc = src; + } + MOV(size, dst, R(src)); + } +} + +void XEmitter::LEA(int bits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "LEA - Imm argument"); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); // TODO: performance warning + src.WriteREX(this, bits, bits); + Write8(0x8D); + src.WriteRest(this, 0, INVALID_REG, bits == 64); +} + +// shift can be either imm8 or cl +void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext) +{ + CheckFlags(); + bool writeImm = false; + if 
(dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteShift - can't shift imms"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "WriteShift - illegal argument"); + } + dest.operandReg = ext; + if (bits == 16) + Write8(0x66); + dest.WriteREX(this, bits, bits, 0); + if (shift.GetImmBits() == 8) + { + // ok an imm + u8 imm = (u8)shift.offset; + if (imm == 1) + { + Write8(bits == 8 ? 0xD0 : 0xD1); + } + else + { + writeImm = true; + Write8(bits == 8 ? 0xC0 : 0xC1); + } + } + else + { + Write8(bits == 8 ? 0xD2 : 0xD3); + } + dest.WriteRest(this, writeImm ? 1 : 0); + if (writeImm) + Write8((u8)shift.offset); +} + +// large rotates and shift are slower on Intel than AMD +// Intel likes to rotate by 1, and the op is smaller too +void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 0); +} +void XEmitter::ROR_(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 1); +} +void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 2); +} +void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 3); +} +void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 4); +} +void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 5); +} +void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 7); +} + +// index can be either imm8 or register, don't use memory destination because it's slow +void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - can't test imms"); + } + if ((index.IsImm() && index.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - illegal argument"); + } + if (bits == 16) + Write8(0x66); + if (index.IsImm()) + { + dest.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0xBA); + dest.WriteRest(this, 1, (X64Reg)ext); + Write8((u8)index.offset); + } + else + { + X64Reg operand = index.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + Write8(0x0F); + Write8(0x83 + 8 * ext); + dest.WriteRest(this, 1, operand); + } +} + +void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 4); +} +void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 5); +} +void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 6); +} +void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 7); +} + +// shift can be either imm8 or cl +void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); + Write8(0xAC); + 
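+ // 0F AC /r ib: the imm8 shift count trails the ModRM/SIB bytes, so
+ // WriteRest below is told one extra byte follows (this matters when
+ // sizing RIP-relative displacements).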
dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); + Write8(0xAD); + dest.WriteRest(this, 0, operand); + } +} + +void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); + Write8(0xA4); + dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); + Write8(0xA5); + dest.WriteRest(this, 0, operand); + } +} + +void OpArg::WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg _operandReg, int bits) +{ + if (bits == 16) + emit->Write8(0x66); + + this->operandReg = (u8)_operandReg; + WriteREX(emit, bits, bits); + emit->Write8(op); + WriteRest(emit); +} + +// operand can either be immediate or register +void OpArg::WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, + int bits) const +{ + X64Reg _operandReg; + if (IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order"); + } + + if (bits == 16) + emit->Write8(0x66); + + int immToWrite = 0; + const NormalOpDef& op_def = normalops[static_cast<int>(op)]; + + if (operand.IsImm()) + { + WriteREX(emit, bits, bits); + + if (!toRM) + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)"); + } + + if (operand.scale == SCALE_IMM8 && bits == 8) + { + // op al, imm8 + if (!scale && offsetOrBaseReg == AL && op_def.eaximm8 != 0xCC) + { + emit->Write8(op_def.eaximm8); + emit->Write8((u8)operand.offset); + return; + } + // mov reg, imm8 + if (!scale && op == NormalOp::MOV) + { + emit->Write8(0xB0 + (offsetOrBaseReg & 7)); + emit->Write8((u8)operand.offset); + return; + } + // op r/m8, imm8 + emit->Write8(op_def.imm8); + immToWrite = 8; + } + else if ((operand.scale == SCALE_IMM16 && bits == 16) || + (operand.scale == SCALE_IMM32 && bits == 32) || + (operand.scale == SCALE_IMM32 && bits == 64)) + { + // Try to save immediate size if we can, but first check to see + // if the instruction supports simm8. + // op r/m, imm8 + if (op_def.simm8 != 0xCC && + ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) || + (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset))) + { + emit->Write8(op_def.simm8); + immToWrite = 8; + } + else + { + // mov reg, imm + if (!scale && op == NormalOp::MOV && bits != 64) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op eax, imm + if (!scale && offsetOrBaseReg == EAX && op_def.eaximm32 != 0xCC) + { + emit->Write8(op_def.eaximm32); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op r/m, imm + emit->Write8(op_def.imm32); + immToWrite = bits == 16 ? 16 : 32; + } + } + else if ((operand.scale == SCALE_IMM8 && bits == 16) || + (operand.scale == SCALE_IMM8 && bits == 32) || + (operand.scale == SCALE_IMM8 && bits == 64)) + { + // op r/m, imm8 + emit->Write8(op_def.simm8); + immToWrite = 8; + } + else if (operand.scale == SCALE_IMM64 && bits == 64) + { + if (scale) + { + ASSERT_MSG(DYNA_REC, 0, + "WriteNormalOp - MOV with 64-bit imm requires register destination"); + } + // mov reg64, imm64 + else if (op == NormalOp::MOV) + { + // movabs reg64, imm64 (10 bytes) + if (static_cast<s64>(operand.offset) != static_cast<s32>(operand.offset)) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + emit->Write64(operand.offset); + return; + } + // mov reg64, simm32 (7 bytes) + emit->Write8(op_def.imm32); + immToWrite = 32; + } + else + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm"); + } + } + else + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits); + } + + // pass extension in REG of ModRM + _operandReg = static_cast<X64Reg>(op_def.ext); + } + else + { + _operandReg = (X64Reg)operand.offsetOrBaseReg; + WriteREX(emit, bits, bits, _operandReg); + // op r/m, reg + if (toRM) + { + emit->Write8(bits == 8 ? op_def.toRm8 : op_def.toRm32); + } + // op reg, r/m + else + { + emit->Write8(bits == 8 ? op_def.fromRm8 : op_def.fromRm32); + } + } + WriteRest(emit, immToWrite >> 3, _operandReg); + switch (immToWrite) + { + case 0: + break; + case 8: + emit->Write8((u8)operand.offset); + break; + case 16: + emit->Write16((u16)operand.offset); + break; + case 32: + emit->Write32((u32)operand.offset); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case"); + } +} + +void XEmitter::WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2) +{ + if (a1.IsImm()) + { + // Booh! Can't write to an imm + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm"); + return; + } + if (a2.IsImm()) + { + a1.WriteNormalOp(this, true, op, a2, bits); + } + else + { + if (a1.IsSimpleReg()) + { + a2.WriteNormalOp(this, false, op, a1, bits); + } + else + { + ASSERT_MSG(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(), + "WriteNormalOp - a1 and a2 cannot both be memory"); + a1.WriteNormalOp(this, true, op, a2, bits); + } + } +} + +void XEmitter::ADD(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::ADD, a1, a2); +} +void XEmitter::ADC(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::ADC, a1, a2); +} +void XEmitter::SUB(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::SUB, a1, a2); +} +void XEmitter::SBB(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::SBB, a1, a2); +} +void XEmitter::AND(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::AND, a1, a2); +} +void XEmitter::OR(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::OR, a1, a2); +} +void XEmitter::XOR(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::XOR, a1, a2); +} +void XEmitter::MOV(int bits, const OpArg& a1, const OpArg& a2) +{ + if (bits == 64 && a1.IsSimpleReg() && a2.scale == SCALE_IMM64 && + a2.offset == static_cast<u32>(a2.offset)) + { + WriteNormalOp(32, NormalOp::MOV, a1, a2.AsImm32()); + return; + } + if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) + { + ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code); + } + WriteNormalOp(bits, NormalOp::MOV, a1, a2); +} +void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::TEST, a1, a2); +} +void XEmitter::CMP(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::CMP, a1, a2); +} +void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2) +{ + WriteNormalOp(bits, NormalOp::XCHG, a1, a2); +} +void XEmitter::CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + if (a1.IsSimpleReg() && a2.IsZero()) // turn 'CMP reg, 0' into shorter 'TEST reg, reg' + { + WriteNormalOp(bits, NormalOp::TEST, a1, a1); + } + else + { + WriteNormalOp(bits, NormalOp::CMP, a1, a2); + } +} + +void XEmitter::MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2) +{ + // This stomps on flags, so ensure they aren't locked + DEBUG_ASSERT(!flags_locked); + + // Zero shortcuts (note that this can generate no code in the case where a1 == dest && a2 == zero + // or a2 == dest && a1 == zero) + if (a1.IsZero()) + { + if (!a2.IsSimpleReg() || a2.GetSimpleReg() != dest) + { + MOV(bits, R(dest), a2); + } + return; + } + if (a2.IsZero()) + { + if (!a1.IsSimpleReg() || a1.GetSimpleReg() != dest) + { + MOV(bits, R(dest), a1); + } + return; + } + + // If dest == a1 or dest == a2 we can simplify this + if (a1.IsSimpleReg() && a1.GetSimpleReg() == dest) + { + ADD(bits, R(dest), a2); + return; + } + + if (a2.IsSimpleReg() && a2.GetSimpleReg() == dest) + { + ADD(bits, R(dest), a1); + return; + } + + // TODO: 32-bit optimizations may apply to other bit sizes (confirm) + if (bits == 32) + { + if (a1.IsImm() && a2.IsImm()) + { + MOV(32, R(dest), Imm32(a1.Imm32() + a2.Imm32())); + return; + } + + if (a1.IsSimpleReg() && a2.IsSimpleReg()) +
{ + LEA(32, dest, MRegSum(a1.GetSimpleReg(), a2.GetSimpleReg())); + return; + } + + if (a1.IsSimpleReg() && a2.IsImm()) + { + LEA(32, dest, MDisp(a1.GetSimpleReg(), a2.Imm32())); + return; + } + + if (a1.IsImm() && a2.IsSimpleReg()) + { + LEA(32, dest, MDisp(a2.GetSimpleReg(), a1.Imm32())); + return; + } + } + + // Fallback + MOV(bits, R(dest), a1); + ADD(bits, R(dest), a2); +} + +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!"); + return; + } + + if (a1.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - second arg cannot be imm!"); + return; + } + + if (!a2.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - third arg must be imm!"); + return; + } + + if (bits == 16) + Write8(0x66); + a1.WriteREX(this, bits, bits, regOp); + + if (a2.GetImmBits() == 8 || (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) || + (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset)) + { + Write8(0x6B); + a1.WriteRest(this, 1, regOp); + Write8((u8)a2.offset); + } + else + { + Write8(0x69); + if (a2.GetImmBits() == 16 && bits == 16) + { + a1.WriteRest(this, 2, regOp); + Write16((u16)a2.offset); + } + else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) + { + a1.WriteRest(this, 4, regOp); + Write32((u32)a2.offset); + } + else + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - unhandled case!"); + } + } +} + +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!"); + return; + } + + if (a.IsImm()) + { + IMUL(bits, regOp, R(regOp), a); + return; + } + + if (bits == 16) + Write8(0x66); + a.WriteREX(this, bits, bits, regOp); + Write8(0x0F); + Write8(0xAF); + a.WriteRest(this, 0, regOp); +} + +void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + if (opPrefix) + Write8(opPrefix); + arg.operandReg = regOp; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + if (op > 0xFF) + Write8((op >> 8) & 0xFF); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes); +} + +static int GetVEXmmmmm(u16 op) +{ + // Currently, only 0x38 and 0x3A are used as secondary escape byte. + if ((op >> 8) == 0x3A) + return 3; + else if ((op >> 8) == 0x38) + return 2; + else + return 1; +} + +static int GetVEXpp(u8 opPrefix) +{ + if (opPrefix == 0x66) + return 1; + else if (opPrefix == 0xF3) + return 2; + else if (opPrefix == 0xF2) + return 3; + else + return 0; +} + +void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W, int extrabytes) +{ + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); + // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here + arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes, regOp1); +} + +void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, 1); + Write8((u8)regOp3 << 4); +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W, int extrabytes) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. 
Bad programmer."); + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer."); + WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W); +} + +void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W) +{ + if (!cpu_info.bFMA) + PanicAlert("Trying to use FMA3 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W); +} + +void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W) +{ + if (!cpu_info.bFMA4) + PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W); +} + +void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (arg.IsImm()) + PanicAlert("BMI1/2 instructions don't support immediate operands."); + if (size != 32 && size != 64) + PanicAlert("BMI1/2 instructions only support 32-bit and 64-bit modes!"); + int W = size == 64; + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bBMI2) + PanicAlert("Trying to use BMI2 on a system that doesn't support it. 
Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6E, dest, arg, 0); +} +void XEmitter::MOVD_xmm(const OpArg& arg, X64Reg src) +{ + WriteSSEOp(0x66, 0x7E, src, arg, 0); +} + +void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) +{ + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = dest; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x6E); + arg.WriteRest(this, 0); +} + +void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) +{ + if (src > 7 || arg.IsSimpleReg()) + { + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = src; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x7E); + arg.WriteRest(this, 0); + } + else + { + arg.operandReg = src; + arg.WriteREX(this, 0, 0); + Write8(0x66); + Write8(0x0f); + Write8(0xD6); + arg.WriteRest(this, 0); + } +} + +void XEmitter::WriteMXCSR(OpArg arg, int ext) +{ + if (arg.IsImm() || arg.IsSimpleReg()) + ASSERT_MSG(DYNA_REC, 0, "MXCSR - invalid operand"); + + arg.operandReg = ext; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0xAE); + arg.WriteRest(this); +} + +void XEmitter::STMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 3); +} +void XEmitter::LDMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 2); +} + +void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg); +} +void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVNTP, regOp, arg); +} +void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTP, regOp, arg); +} + +void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseADD, regOp, arg); +} +void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseADD, regOp, arg); +} +void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSUB, regOp, arg); +} +void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSUB, regOp, arg); +} +void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::MULSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMUL, regOp, arg); +} +void XEmitter::MULSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMUL, regOp, arg); +} +void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseDIV, regOp, arg); +} +void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseDIV, regOp, arg); +} +void XEmitter::MINSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMIN, regOp, arg); +} +void XEmitter::MINSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMIN, regOp, arg); +} +void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMAX, regOp, arg); +} +void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMAX, regOp, arg); +} +void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSQRT, regOp, arg); +} +void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSQRT, regOp, arg); +} +void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRCP, regOp, arg); 
+} +void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRSQRT, regOp, arg); +} + +void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseADD, regOp, arg); +} +void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseADD, regOp, arg); +} +void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSUB, regOp, arg); +} +void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSUB, regOp, arg); +} +void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x00, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x66, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseAND, regOp, arg); +} +void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseAND, regOp, arg); +} +void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseANDN, regOp, arg); +} +void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseANDN, regOp, arg); +} +void XEmitter::ORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseOR, regOp, arg); +} +void XEmitter::ORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseOR, regOp, arg); +} +void XEmitter::XORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseXOR, regOp, arg); +} +void XEmitter::XORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseXOR, regOp, arg); +} +void XEmitter::MULPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMUL, regOp, arg); +} +void XEmitter::MULPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMUL, regOp, arg); +} +void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseDIV, regOp, arg); +} +void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseDIV, regOp, arg); +} +void XEmitter::MINPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMIN, regOp, arg); +} +void XEmitter::MINPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMIN, regOp, arg); +} +void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMAX, regOp, arg); +} +void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMAX, regOp, arg); +} +void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSQRT, regOp, arg); +} +void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSQRT, regOp, arg); +} +void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRCP, regOp, arg); +} +void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRSQRT, regOp, arg); +} +void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x00, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} + +void XEmitter::COMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseCOMIS, regOp, arg); +} // weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseCOMIS, regOp, arg); +} // ordered +void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseUCOMIS, regOp, arg); +} // unordered +void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseUCOMIS, regOp, arg); +} + +void 
XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg); +} +void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg); +} + +void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg); +} +void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg); +} + +void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); +} +void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); +} + +void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); +} +void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); +} + +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2)); +} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2)); +} + +void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5A, regOp, arg); +} +void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5A, regOp, arg); +} + +void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x5A, regOp, arg); +} +void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5A, regOp, arg); +} +void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2D, regOp, arg); +} +void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2D, regOp, arg); +} +void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2A, regOp, arg); +} +void 
XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2A, regOp, arg); +} + +void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0xE6, regOp, arg); +} +void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5B, regOp, arg); +} +void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0xE6, regOp, arg); +} +void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5B, regOp, arg); +} + +void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2C, regOp, arg); +} +void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2C, regOp, arg); +} +void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5B, regOp, arg); +} +void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE6, regOp, arg); +} + +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) +{ + WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src)); +} + +void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x50, dest, arg); +} +void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x50, dest, arg); +} + +void XEmitter::LDDQU(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseLDDQU, dest, arg); +} // For integer data only + +void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x14, dest, arg); +} +void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x15, dest, arg); +} +void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x14, dest, arg); +} +void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x15, dest, arg); +} + +// Pretty much every x86 CPU nowadays supports SSE3, +// but the SSE2 fallbacks are easy. 
+void XEmitter::MOVSLDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF3, 0x12, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVAPD(regOp, arg); + UNPCKLPS(regOp, R(regOp)); + } +} +void XEmitter::MOVSHDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF3, 0x16, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVAPD(regOp, arg); + UNPCKHPS(regOp, R(regOp)); + } +} +void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF2, 0x12, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVSD(regOp, arg); + UNPCKLPD(regOp, R(regOp)); + } +} + +// There are a few more left + +// Also some integer instructions are missing +void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6B, dest, arg); +} +void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x63, dest, arg); +} +void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x67, dest, arg); +} + +void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x60, dest, arg); +} +void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x61, dest, arg); +} +void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x62, dest, arg); +} +void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6C, dest, arg); +} + +void XEmitter::PSRLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xd3, reg, arg); +} + +void XEmitter::PSRLDQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLDQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); + Write8(shift); +} + +// WARNING not REX compatible +void XEmitter::PSRAW(X64Reg reg, int shift) +{ + if (reg > 7) + PanicAlert("The PSRAW-emitter does not support regs above 7"); + Write8(0x66); + Write8(0x0f); + Write8(0x71); + Write8(0xE0 | reg); + Write8(shift); +} + +// WARNING not REX compatible +void XEmitter::PSRAD(X64Reg reg, int shift) +{ + if (reg > 7) + PanicAlert("The PSRAD-emitter does not support regs above 7"); + Write8(0x66); + Write8(0x0f); + Write8(0x72); + Write8(0xE0 | reg); + Write8(shift); +} + +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bSSSE3) + PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bSSE4_1) + PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. 
Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg) +{ + WriteSSSE3Op(0x66, 0x3800, dest, arg); +} +void XEmitter::PTEST(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3817, dest, arg); +} +void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x382b, dest, arg); +} + +void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3820, dest, arg); +} +void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3821, dest, arg); +} +void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3822, dest, arg); +} +void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3823, dest, arg); +} +void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3824, dest, arg); +} +void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3825, dest, arg); +} +void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3830, dest, arg); +} +void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3831, dest, arg); +} +void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3832, dest, arg); +} +void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3833, dest, arg); +} +void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3834, dest, arg); +} +void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3835, dest, arg); +} + +void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3810, dest, arg); +} +void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3814, dest, arg); +} +void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3815, dest, arg); +} +void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); + Write8(blend); +} +void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); + Write8(blend); +} + +void XEmitter::PAND(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDB, dest, arg); +} +void XEmitter::PANDN(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDF, dest, arg); +} +void XEmitter::PXOR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEF, dest, arg); +} +void XEmitter::POR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEB, dest, arg); +} + +void XEmitter::PADDB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFC, dest, arg); +} +void XEmitter::PADDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFD, dest, arg); +} +void XEmitter::PADDD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFE, dest, arg); +} +void XEmitter::PADDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD4, dest, arg); +} + +void XEmitter::PADDSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEC, dest, arg); +} +void XEmitter::PADDSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xED, dest, arg); +} +void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDC, dest, arg); +} +void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDD, dest, arg); +} + +void XEmitter::PSUBB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF8, dest, arg); +} +void XEmitter::PSUBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF9, dest, arg); +} +void 
XEmitter::PSUBD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFA, dest, arg); +} +void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFB, dest, arg); +} + +void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE8, dest, arg); +} +void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE9, dest, arg); +} +void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD8, dest, arg); +} +void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD9, dest, arg); +} + +void XEmitter::PAVGB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE0, dest, arg); +} +void XEmitter::PAVGW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE3, dest, arg); +} + +void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x74, dest, arg); +} +void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x75, dest, arg); +} +void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x76, dest, arg); +} + +void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x64, dest, arg); +} +void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x65, dest, arg); +} +void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x66, dest, arg); +} + +void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC5, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC4, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSE41Op(0x66, 0x3A22, dest, arg); + Write8(subreg); +} + +void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF5, dest, arg); +} +void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF6, dest, arg); +} + +void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEE, dest, arg); +} +void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDE, dest, arg); +} +void XEmitter::PMINSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEA, dest, arg); +} +void XEmitter::PMINUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDA, dest, arg); +} + +void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD7, dest, arg); +} +void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF2, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF3, 0x70, regOp, arg, 1); + Write8(shuffle); +} + +// VEX +void XEmitter::VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + 
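+ // These VEX wrappers all take the non-destructive three-operand form:
+ // regOp1 is the destination, regOp2 the first source (encoded in
+ // VEX.vvvv) and arg the second source, unlike the two-operand SSE
+ // forms above.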
WriteAVXOp(0x00, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg); +} +void XEmitter::VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare) +{ + WriteAVXOp(0x66, sseCMP, regOp1, regOp2, arg, 0, 1); + Write8(compare); +} +void XEmitter::VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x00, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg); +} +void XEmitter::VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3) +{ + WriteAVXOp4(0x66, 0x3A4B, regOp1, regOp2, arg, regOp3); +} +void XEmitter::VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0C, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} +void XEmitter::VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0D, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); +} +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); +} +void 
XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); +} +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); +} + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); +} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); +} +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); +} +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); +} + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg 
regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1); +} + +#define FMA4(name, op) \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1); \ + } \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0); \ + } + +FMA4(VFMADDSUBPS, 0x5C) +FMA4(VFMADDSUBPD, 0x5D) +FMA4(VFMSUBADDPS, 0x5E) +FMA4(VFMSUBADDPD, 0x5F) +FMA4(VFMADDPS, 0x68) +FMA4(VFMADDPD, 0x69) +FMA4(VFMADDSS, 0x6A) +FMA4(VFMADDSD, 0x6B) +FMA4(VFMSUBPS, 0x6C) +FMA4(VFMSUBPD, 0x6D) +FMA4(VFMSUBSS, 0x6E) +FMA4(VFMSUBSD, 0x6F) +FMA4(VFNMADDPS, 0x78) +FMA4(VFNMADDPD, 0x79) +FMA4(VFNMADDSS, 0x7A) +FMA4(VFNMADDSD, 0x7B) +FMA4(VFNMSUBPS, 0x7C) +FMA4(VFNMSUBPD, 0x7D) +FMA4(VFNMSUBSS, 0x7E) +FMA4(VFNMSUBSD, 0x7F) +#undef FMA4 + +void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate) +{ + WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); + Write8(rotate); +} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg); +} +void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + CheckFlags(); + WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg); +} +void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg); +} +void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, 
(X64Reg)0x3, regOp, arg); +} +void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg); +} + +// Prefixes + +void XEmitter::LOCK() +{ + Write8(0xF0); +} +void XEmitter::REP() +{ + Write8(0xF3); +} +void XEmitter::REPNE() +{ + Write8(0xF2); +} +void XEmitter::FSOverride() +{ + Write8(0x64); +} +void XEmitter::GSOverride() +{ + Write8(0x65); +} + +void XEmitter::FWAIT() +{ + Write8(0x9B); +} + +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg) +{ + int mf = 0; + ASSERT_MSG(DYNA_REC, !(bits == 80 && op_80b == FloatOp::Invalid), + "WriteFloatLoadStore: 80 bits not supported for this instruction"); + switch (bits) + { + case 32: + mf = 0; + break; + case 64: + mf = 4; + break; + case 80: + mf = 2; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); + } + Write8(0xd9 | mf); + // x87 instructions use the reg field of the ModR/M byte as opcode: + if (bits == 80) + op = op_80b; + arg.WriteRest(this, 0, static_cast<X64Reg>(op)); +} + +void XEmitter::FLD(int bits, const OpArg& src) +{ + WriteFloatLoadStore(bits, FloatOp::LD, FloatOp::LD80, src); +} +void XEmitter::FST(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::ST, FloatOp::Invalid, dest); +} +void XEmitter::FSTP(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::STP, FloatOp::STP80, dest); +} +void XEmitter::FNSTSW_AX() +{ + Write8(0xDF); + Write8(0xE0); +} + +void XEmitter::RDTSC() +{ + Write8(0x0F); + Write8(0x31); +} +} diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h new file mode 100644 index 0000000..122850d --- /dev/null +++ b/src/dolphin/x64Emitter.h @@ -0,0 +1,1180 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!!
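+// A minimal usage sketch (illustrative only, assuming buffer points at writable, executable memory): +//   Gen::XEmitter emit(buffer); +//   emit.MOV(32, Gen::R(Gen::EAX), Gen::Imm32(42)); +//   emit.RET(); +//   u32 ret = ((u32 (*)())buffer)();  // ret == 42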
+ +#pragma once + +#include <cstddef> +#include <cstring> +#include <functional> +#include <tuple> +#include <type_traits> + +#include "Assert.h" +#include "BitSet.h" +#include "CodeBlock.h" +#include "../types.h" +#include "x64ABI.h" + +namespace Gen +{ +enum CCFlags +{ + CC_O = 0, + CC_NO = 1, + CC_B = 2, + CC_C = 2, + CC_NAE = 2, + CC_NB = 3, + CC_NC = 3, + CC_AE = 3, + CC_Z = 4, + CC_E = 4, + CC_NZ = 5, + CC_NE = 5, + CC_BE = 6, + CC_NA = 6, + CC_NBE = 7, + CC_A = 7, + CC_S = 8, + CC_NS = 9, + CC_P = 0xA, + CC_PE = 0xA, + CC_NP = 0xB, + CC_PO = 0xB, + CC_L = 0xC, + CC_NGE = 0xC, + CC_NL = 0xD, + CC_GE = 0xD, + CC_LE = 0xE, + CC_NG = 0xE, + CC_NLE = 0xF, + CC_G = 0xF +}; + +enum +{ + NUMGPRs = 16, + NUMXMMs = 16, +}; + +enum +{ + SCALE_NONE = 0, + SCALE_1 = 1, + SCALE_2 = 2, + SCALE_4 = 4, + SCALE_8 = 8, + SCALE_ATREG = 16, + // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG + SCALE_NOBASE_2 = 34, + SCALE_NOBASE_4 = 36, + SCALE_NOBASE_8 = 40, + SCALE_RIP = 0xFF, + SCALE_IMM8 = 0xF0, + SCALE_IMM16 = 0xF1, + SCALE_IMM32 = 0xF2, + SCALE_IMM64 = 0xF3, +}; + +enum SSECompare +{ + CMP_EQ = 0, + CMP_LT = 1, + CMP_LE = 2, + CMP_UNORD = 3, + CMP_NEQ = 4, + CMP_NLT = 5, + CMP_NLE = 6, + CMP_ORD = 7, +}; + +class XEmitter; +enum class FloatOp; +enum class NormalOp; + +// Information about a generated MOV op +struct MovInfo final +{ + u8* address; + bool nonAtomicSwapStore; + // valid iff nonAtomicSwapStore is true + X64Reg nonAtomicSwapStoreSrc; +}; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ + // For accessing offset and operandReg. + // This also allows us to keep the op writing functions private. + friend class XEmitter; + + // dummy op arg, used for storage + constexpr OpArg() = default; + constexpr OpArg(u64 offset_, int scale_, X64Reg rm_reg = RAX, X64Reg scaled_reg = RAX) + : scale{static_cast<u8>(scale_)}, offsetOrBaseReg{static_cast<u16>(rm_reg)}, + indexReg{static_cast<u16>(scaled_reg)}, offset{offset_} + { + } + constexpr bool operator==(const OpArg& b) const + { + // TODO: Use std::tie here once Dolphin requires C++17. (We can't do it immediately, + // (because we still support some older versions of GCC where std::tie is not constexpr.)
+ return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && + indexReg == b.indexReg && offset == b.offset; + } + constexpr bool operator!=(const OpArg& b) const { return !operator==(b); } + u64 Imm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (u64)offset; + } + u32 Imm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (u32)offset; + } + u16 Imm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (u16)offset; + } + u8 Imm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (u8)offset; + } + + s64 SImm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (s64)offset; + } + s32 SImm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (s32)offset; + } + s16 SImm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (s16)offset; + } + s8 SImm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (s8)offset; + } + + OpArg AsImm64() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u64)offset, SCALE_IMM64); + } + OpArg AsImm32() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u32)offset, SCALE_IMM32); + } + OpArg AsImm16() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u16)offset, SCALE_IMM16); + } + OpArg AsImm8() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u8)offset, SCALE_IMM8); + } + + constexpr bool IsImm() const + { + return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || + scale == SCALE_IMM64; + } + constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; } + constexpr bool IsSimpleReg(X64Reg reg) const { return IsSimpleReg() && GetSimpleReg() == reg; } + constexpr bool IsZero() const { return IsImm() && offset == 0; } + constexpr int GetImmBits() const + { + switch (scale) + { + case SCALE_IMM8: + return 8; + case SCALE_IMM16: + return 16; + case SCALE_IMM32: + return 32; + case SCALE_IMM64: + return 64; + default: + return -1; + } + } + + constexpr X64Reg GetSimpleReg() const + { + if (scale == SCALE_NONE) + return static_cast<X64Reg>(offsetOrBaseReg); + + return INVALID_REG; + } + + void AddMemOffset(int val) + { + DEBUG_ASSERT_MSG(DYNA_REC, scale == SCALE_RIP || (scale <= SCALE_ATREG && scale > SCALE_NONE), + "Tried to increment an OpArg which doesn't have an offset"); + offset += val; + } + +private: + void WriteREX(XEmitter* emit, int opBits, int bits, int customOp = -1) const; + void WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W = 0) const; + void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG, + bool warn_64bit_offset = true) const; + void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits); + void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const; + + u8 scale = 0; + u16 offsetOrBaseReg = 0; + u16 indexReg = 0; + u64 offset = 0; // Also used to store immediates.
+ u16 operandReg = 0; +}; + +template <typename T> +inline OpArg M(const T* ptr) +{ + return OpArg((u64)(const void*)ptr, (int)SCALE_RIP); +} +constexpr OpArg R(X64Reg value) +{ + return OpArg(0, SCALE_NONE, value); +} +constexpr OpArg MatR(X64Reg value) +{ + return OpArg(0, SCALE_ATREG, value); +} + +constexpr OpArg MDisp(X64Reg value, int offset) +{ + return OpArg(static_cast<u32>(offset), SCALE_ATREG, value); +} + +constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ + return OpArg(offset, scale, base, scaled); +} + +constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) +{ + if (scale == SCALE_1) + return OpArg(offset, SCALE_ATREG, scaled); + + return OpArg(offset, scale | 0x20, RAX, scaled); +} + +constexpr OpArg MRegSum(X64Reg base, X64Reg offset) +{ + return MComplex(base, offset, 1, 0); +} + +constexpr OpArg Imm8(u8 imm) +{ + return OpArg(imm, SCALE_IMM8); +} +constexpr OpArg Imm16(u16 imm) +{ + return OpArg(imm, SCALE_IMM16); +} // rarely used +constexpr OpArg Imm32(u32 imm) +{ + return OpArg(imm, SCALE_IMM32); +} +constexpr OpArg Imm64(u64 imm) +{ + return OpArg(imm, SCALE_IMM64); +} +inline OpArg ImmPtr(const void* imm) +{ + return Imm64(reinterpret_cast<u64>(imm)); +} + +inline u32 PtrOffset(const void* ptr, const void* base = nullptr) +{ + s64 distance = (s64)ptr - (s64)base; + if (distance >= 0x80000000LL || distance < -0x80000000LL) + { + ASSERT_MSG(DYNA_REC, 0, "pointer offset out of range"); + return 0; + } + + return (u32)distance; +} + +// usage: int a[]; ARRAY_OFFSET(a,10) +#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0])) +// usage: struct {int e;} s; STRUCT_OFFSET(s,e) +#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str))) + +struct FixupBranch +{ + enum class Type + { + Branch8Bit, + Branch32Bit + }; + + u8* ptr; + Type type; +}; + +class XEmitter +{ + friend struct OpArg; // for Write8 etc +private: + u8* code = nullptr; + bool flags_locked = false; + + void CheckFlags(); + + void Rex(int w, int r, int x, int b); + void WriteModRM(int mod, int reg, int rm); + void WriteSIB(int scale, int index, int base); + void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); + void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); + void WriteMulDivType(int bits, OpArg src, int ext); + void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); + void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext); + void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext); + void WriteMXCSR(OpArg arg, int ext); + void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void WriteBMIOp(int size, u8
opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteMOVBE(int bits, u8 op, X64Reg regOp, const OpArg& arg); + void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); + void WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2); + + void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + +protected: + void Write8(u8 value); + void Write16(u16 value); + void Write32(u32 value); + void Write64(u64 value); + +public: + XEmitter() = default; + explicit XEmitter(u8* code_ptr) : code{code_ptr} {} + virtual ~XEmitter() = default; + void SetCodePtr(u8* ptr); + void ReserveCodeSpace(int bytes); + u8* AlignCodeTo(size_t alignment); + u8* AlignCode4(); + u8* AlignCode16(); + u8* AlignCodePage(); + const u8* GetCodePtr() const; + u8* GetWritableCodePtr(); + + void LockFlags() { flags_locked = true; } + void UnlockFlags() { flags_locked = false; } + // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs + // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other + // string instr., + // INC and DEC are slow on Intel Core, but not on AMD. They create a + // false flag dependency because they only update a subset of the flags. + // XCHG is SLOW and should be avoided. + + // Debug breakpoint + void INT3(); + + // Do nothing + void NOP(size_t count = 1); + + // Save energy in wait-loops on P4 only. Probably not too useful. + void PAUSE(); + + // Flag control + void STC(); + void CLC(); + void CMC(); + + // These two cannot be executed in 64-bit mode on early Intel 64-bit CPUs, only on Core2 and + // AMD! + void LAHF(); // 3 cycle vector path + void SAHF(); // direct path fast + + // Stack control + void PUSH(X64Reg reg); + void POP(X64Reg reg); + void PUSH(int bits, const OpArg& reg); + void POP(int bits, const OpArg& reg); + void PUSHF(); + void POPF(); + + // Flow control + void RET(); + void RET_FAST(); + void UD2(); + FixupBranch J(bool force5bytes = false); + + void JMP(const u8* addr, bool force5Bytes = false); + void JMPptr(const OpArg& arg); + void JMPself(); // infinite loop! +#ifdef CALL +#undef CALL +#endif + void CALL(const void* fnptr); + FixupBranch CALL(); + void CALLptr(OpArg arg); + + FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); + void J_CC(CCFlags conditionCode, const u8* addr); + + void SetJumpTarget(const FixupBranch& branch); + + void SETcc(CCFlags flag, OpArg dest); + // Note: CMOV brings small if any benefit on current CPUs.
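+  // For instance, a branchless maximum of EAX and ECX can be sketched as: +  //   CMP(32, R(EAX), R(ECX)); +  //   CMOVcc(32, EAX, R(ECX), CC_L);  // EAX = (EAX < ECX) ? ECX : EAX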
+ void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + + // Fences + void LFENCE(); + void MFENCE(); + void SFENCE(); + + // Bit scan + void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit + void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit + + // Cache control + enum PrefetchLevel + { + PF_NTA, // Non-temporal (data used once and only once) + PF_T0, // All cache levels + PF_T1, // Levels 2+ (aliased to T0 on AMD) + PF_T2, // Levels 3+ (aliased to T0 on AMD) + }; + void PREFETCH(PrefetchLevel level, OpArg arg); + void MOVNTI(int bits, const OpArg& dest, X64Reg src); + void MOVNTDQ(const OpArg& arg, X64Reg regOp); + void MOVNTPS(const OpArg& arg, X64Reg regOp); + void MOVNTPD(const OpArg& arg, X64Reg regOp); + + // Multiplication / division + void MUL(int bits, const OpArg& src); // UNSIGNED + void IMUL(int bits, const OpArg& src); // SIGNED + void IMUL(int bits, X64Reg regOp, const OpArg& src); + void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm); + void DIV(int bits, const OpArg& src); + void IDIV(int bits, const OpArg& src); + + // Shift + void ROL(int bits, const OpArg& dest, const OpArg& shift); + void ROR_(int bits, const OpArg& dest, const OpArg& shift); + void RCL(int bits, const OpArg& dest, const OpArg& shift); + void RCR(int bits, const OpArg& dest, const OpArg& shift); + void SHL(int bits, const OpArg& dest, const OpArg& shift); + void SHR(int bits, const OpArg& dest, const OpArg& shift); + void SAR(int bits, const OpArg& dest, const OpArg& shift); + + // Bit Test + void BT(int bits, const OpArg& dest, const OpArg& index); + void BTS(int bits, const OpArg& dest, const OpArg& index); + void BTR(int bits, const OpArg& dest, const OpArg& index); + void BTC(int bits, const OpArg& dest, const OpArg& index); + + // Double-Precision Shift + void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + + // Extend EAX into EDX in various ways + void CWD(int bits = 16); + inline void CDQ() { CWD(32); } + inline void CQO() { CWD(64); } + void CBW(int bits = 8); + inline void CWDE() { CBW(16); } + inline void CDQE() { CBW(32); } + // Load effective address + void LEA(int bits, X64Reg dest, OpArg src); + + // Integer arithmetic + void NEG(int bits, const OpArg& src); + void ADD(int bits, const OpArg& a1, const OpArg& a2); + void ADC(int bits, const OpArg& a1, const OpArg& a2); + void SUB(int bits, const OpArg& a1, const OpArg& a2); + void SBB(int bits, const OpArg& a1, const OpArg& a2); + void AND(int bits, const OpArg& a1, const OpArg& a2); + void CMP(int bits, const OpArg& a1, const OpArg& a2); + + // Bit operations + void NOT(int bits, const OpArg& src); + void OR(int bits, const OpArg& a1, const OpArg& a2); + void XOR(int bits, const OpArg& a1, const OpArg& a2); + void MOV(int bits, const OpArg& a1, const OpArg& a2); + void TEST(int bits, const OpArg& a1, const OpArg& a2); + + void CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2); + void MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2); + + // Are these useful at all? Consider removing. + void XCHG(int bits, const OpArg& a1, const OpArg& a2); + void XCHG_AHAL(); + + // Byte swapping (32 and 64-bit only). 
+ void BSWAP(int bits, X64Reg reg); + + // Sign/zero extension + void MOVSX(int dbits, int sbits, X64Reg dest, + OpArg src); // automatically uses MOVSXD if necessary + void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); + + // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE. + void MOVBE(int bits, X64Reg dest, const OpArg& src); + void MOVBE(int bits, const OpArg& dest, X64Reg src); + void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false, + MovInfo* info = nullptr); + void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr); + + // Available only on AMD >= Phenom or Intel >= Haswell + void LZCNT(int bits, X64Reg dest, const OpArg& src); + // Note: this one is actually part of BMI1 + void TZCNT(int bits, X64Reg dest, const OpArg& src); + + // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) + void STMXCSR(const OpArg& memloc); + void LDMXCSR(const OpArg& memloc); + + // Prefixes + void LOCK(); + void REP(); + void REPNE(); + void FSOverride(); + void GSOverride(); + + // x87 + enum x87StatusWordBits + { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + + void FLD(int bits, const OpArg& src); + void FST(int bits, const OpArg& dest); + void FSTP(int bits, const OpArg& dest); + void FNSTSW_AX(); + void FWAIT(); + + // SSE/SSE2: Floating point arithmetic + void ADDSS(X64Reg regOp, const OpArg& arg); + void ADDSD(X64Reg regOp, const OpArg& arg); + void SUBSS(X64Reg regOp, const OpArg& arg); + void SUBSD(X64Reg regOp, const OpArg& arg); + void MULSS(X64Reg regOp, const OpArg& arg); + void MULSD(X64Reg regOp, const OpArg& arg); + void DIVSS(X64Reg regOp, const OpArg& arg); + void DIVSD(X64Reg regOp, const OpArg& arg); + void MINSS(X64Reg regOp, const OpArg& arg); + void MINSD(X64Reg regOp, const OpArg& arg); + void MAXSS(X64Reg regOp, const OpArg& arg); + void MAXSD(X64Reg regOp, const OpArg& arg); + void SQRTSS(X64Reg regOp, const OpArg& arg); + void SQRTSD(X64Reg regOp, const OpArg& arg); + void RCPSS(X64Reg regOp, const OpArg& arg); + void RSQRTSS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point bitwise (yes) + void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare); + + // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) + void ADDPS(X64Reg regOp, const OpArg& arg); + void ADDPD(X64Reg regOp, const OpArg& arg); + void SUBPS(X64Reg regOp, const OpArg& arg); + void SUBPD(X64Reg regOp, const OpArg& arg); + void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare); + void MULPS(X64Reg regOp, const OpArg& arg); + void MULPD(X64Reg regOp, const OpArg& arg); + void DIVPS(X64Reg regOp, const OpArg& arg); + void DIVPD(X64Reg regOp, const OpArg& arg); + void MINPS(X64Reg regOp, const OpArg& arg); + void MINPD(X64Reg regOp, const OpArg& arg); + void MAXPS(X64Reg regOp, const OpArg& arg); + void MAXPD(X64Reg regOp, const OpArg& arg); + void SQRTPS(X64Reg regOp, const OpArg& arg); + void SQRTPD(X64Reg regOp, const OpArg& arg); + void RCPPS(X64Reg regOp, const OpArg& arg); + void RSQRTPS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point 
packed bitwise (x4 for float, x2 for double) + void ANDPS(X64Reg regOp, const OpArg& arg); + void ANDPD(X64Reg regOp, const OpArg& arg); + void ANDNPS(X64Reg regOp, const OpArg& arg); + void ANDNPD(X64Reg regOp, const OpArg& arg); + void ORPS(X64Reg regOp, const OpArg& arg); + void ORPD(X64Reg regOp, const OpArg& arg); + void XORPS(X64Reg regOp, const OpArg& arg); + void XORPD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. + void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle); + void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle); + + // SSE3 + void MOVSLDUP(X64Reg regOp, const OpArg& arg); + void MOVSHDUP(X64Reg regOp, const OpArg& arg); + void MOVDDUP(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Useful alternative to shuffle in some cases. + void UNPCKLPS(X64Reg dest, const OpArg& src); + void UNPCKHPS(X64Reg dest, const OpArg& src); + void UNPCKLPD(X64Reg dest, const OpArg& src); + void UNPCKHPD(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Compares. + void COMISS(X64Reg regOp, const OpArg& arg); + void COMISD(X64Reg regOp, const OpArg& arg); + void UCOMISS(X64Reg regOp, const OpArg& arg); + void UCOMISD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Moves. Use the right data type for your data, in most cases. + void MOVAPS(X64Reg regOp, const OpArg& arg); + void MOVAPD(X64Reg regOp, const OpArg& arg); + void MOVAPS(const OpArg& arg, X64Reg regOp); + void MOVAPD(const OpArg& arg, X64Reg regOp); + + void MOVUPS(X64Reg regOp, const OpArg& arg); + void MOVUPD(X64Reg regOp, const OpArg& arg); + void MOVUPS(const OpArg& arg, X64Reg regOp); + void MOVUPD(const OpArg& arg, X64Reg regOp); + + void MOVDQA(X64Reg regOp, const OpArg& arg); + void MOVDQA(const OpArg& arg, X64Reg regOp); + void MOVDQU(X64Reg regOp, const OpArg& arg); + void MOVDQU(const OpArg& arg, X64Reg regOp); + + void MOVSS(X64Reg regOp, const OpArg& arg); + void MOVSD(X64Reg regOp, const OpArg& arg); + void MOVSS(const OpArg& arg, X64Reg regOp); + void MOVSD(const OpArg& arg, X64Reg regOp); + + void MOVLPS(X64Reg regOp, const OpArg& arg); + void MOVLPD(X64Reg regOp, const OpArg& arg); + void MOVLPS(const OpArg& arg, X64Reg regOp); + void MOVLPD(const OpArg& arg, X64Reg regOp); + + void MOVHPS(X64Reg regOp, const OpArg& arg); + void MOVHPD(X64Reg regOp, const OpArg& arg); + void MOVHPS(const OpArg& arg, X64Reg regOp); + void MOVHPD(const OpArg& arg, X64Reg regOp); + + void MOVHLPS(X64Reg regOp1, X64Reg regOp2); + void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + + // Be careful when using these overloads for reg <--> xmm moves. + // The one you cast to OpArg with R(reg) is the x86 reg, the other + // one is the xmm reg. + // ie: "MOVD_xmm(eax, R(xmm1))" generates incorrect code (movd xmm0, rcx) + // use "MOVD_xmm(R(eax), xmm1)" instead. + void MOVD_xmm(X64Reg dest, const OpArg& arg); + void MOVQ_xmm(X64Reg dest, OpArg arg); + void MOVD_xmm(const OpArg& arg, X64Reg src); + void MOVQ_xmm(OpArg arg, X64Reg src); + + // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in + // question. + void MOVMSKPS(X64Reg dest, const OpArg& arg); + void MOVMSKPD(X64Reg dest, const OpArg& arg); + + // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a + // weird one. + void MASKMOVDQU(X64Reg dest, X64Reg src); + void LDDQU(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Data type conversions. 
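+  // For example (a sketch): CVTSI2SD(XMM0, R(EAX)) widens the integer in EAX to a double, +  // and CVTTSD2SI(EAX, R(XMM0)) truncates it back; the extra T selects round-toward-zero +  // instead of the current MXCSR rounding mode.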
+ void CVTPS2PD(X64Reg dest, const OpArg& src); + void CVTPD2PS(X64Reg dest, const OpArg& src); + void CVTSS2SD(X64Reg dest, const OpArg& src); + void CVTSI2SS(X64Reg dest, const OpArg& src); + void CVTSD2SS(X64Reg dest, const OpArg& src); + void CVTSI2SD(X64Reg dest, const OpArg& src); + void CVTDQ2PD(X64Reg regOp, const OpArg& arg); + void CVTPD2DQ(X64Reg regOp, const OpArg& arg); + void CVTDQ2PS(X64Reg regOp, const OpArg& arg); + void CVTPS2DQ(X64Reg regOp, const OpArg& arg); + + void CVTTPS2DQ(X64Reg regOp, const OpArg& arg); + void CVTTPD2DQ(X64Reg regOp, const OpArg& arg); + + // Destinations are X64 regs (rax, rbx, ...) for these instructions. + void CVTSS2SI(X64Reg xregdest, const OpArg& src); + void CVTSD2SI(X64Reg xregdest, const OpArg& src); + void CVTTSS2SI(X64Reg xregdest, const OpArg& arg); + void CVTTSD2SI(X64Reg xregdest, const OpArg& arg); + + // SSE2: Packed integer instructions + void PACKSSDW(X64Reg dest, const OpArg& arg); + void PACKSSWB(X64Reg dest, const OpArg& arg); + void PACKUSDW(X64Reg dest, const OpArg& arg); + void PACKUSWB(X64Reg dest, const OpArg& arg); + + void PUNPCKLBW(X64Reg dest, const OpArg& arg); + void PUNPCKLWD(X64Reg dest, const OpArg& arg); + void PUNPCKLDQ(X64Reg dest, const OpArg& arg); + void PUNPCKLQDQ(X64Reg dest, const OpArg& arg); + + void PTEST(X64Reg dest, const OpArg& arg); + void PAND(X64Reg dest, const OpArg& arg); + void PANDN(X64Reg dest, const OpArg& arg); + void PXOR(X64Reg dest, const OpArg& arg); + void POR(X64Reg dest, const OpArg& arg); + + void PADDB(X64Reg dest, const OpArg& arg); + void PADDW(X64Reg dest, const OpArg& arg); + void PADDD(X64Reg dest, const OpArg& arg); + void PADDQ(X64Reg dest, const OpArg& arg); + + void PADDSB(X64Reg dest, const OpArg& arg); + void PADDSW(X64Reg dest, const OpArg& arg); + void PADDUSB(X64Reg dest, const OpArg& arg); + void PADDUSW(X64Reg dest, const OpArg& arg); + + void PSUBB(X64Reg dest, const OpArg& arg); + void PSUBW(X64Reg dest, const OpArg& arg); + void PSUBD(X64Reg dest, const OpArg& arg); + void PSUBQ(X64Reg dest, const OpArg& arg); + + void PSUBSB(X64Reg dest, const OpArg& arg); + void PSUBSW(X64Reg dest, const OpArg& arg); + void PSUBUSB(X64Reg dest, const OpArg& arg); + void PSUBUSW(X64Reg dest, const OpArg& arg); + + void PAVGB(X64Reg dest, const OpArg& arg); + void PAVGW(X64Reg dest, const OpArg& arg); + + void PCMPEQB(X64Reg dest, const OpArg& arg); + void PCMPEQW(X64Reg dest, const OpArg& arg); + void PCMPEQD(X64Reg dest, const OpArg& arg); + + void PCMPGTB(X64Reg dest, const OpArg& arg); + void PCMPGTW(X64Reg dest, const OpArg& arg); + void PCMPGTD(X64Reg dest, const OpArg& arg); + + void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg); + + void PMADDWD(X64Reg dest, const OpArg& arg); + void PSADBW(X64Reg dest, const OpArg& arg); + + void PMAXSW(X64Reg dest, const OpArg& arg); + void PMAXUB(X64Reg dest, const OpArg& arg); + void PMINSW(X64Reg dest, const OpArg& arg); + void PMINUB(X64Reg dest, const OpArg& arg); + + void PMOVMSKB(X64Reg dest, const OpArg& arg); + void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFB(X64Reg dest, const OpArg& arg); + + void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); + + void PSRLW(X64Reg reg, int shift); + void PSRLD(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, const OpArg& arg); + void PSRLDQ(X64Reg reg, 
int shift); + + void PSLLW(X64Reg reg, int shift); + void PSLLD(X64Reg reg, int shift); + void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + + void PSRAW(X64Reg reg, int shift); + void PSRAD(X64Reg reg, int shift); + + // SSE4: data type conversions + void PMOVSXBW(X64Reg dest, const OpArg& arg); + void PMOVSXBD(X64Reg dest, const OpArg& arg); + void PMOVSXBQ(X64Reg dest, const OpArg& arg); + void PMOVSXWD(X64Reg dest, const OpArg& arg); + void PMOVSXWQ(X64Reg dest, const OpArg& arg); + void PMOVSXDQ(X64Reg dest, const OpArg& arg); + void PMOVZXBW(X64Reg dest, const OpArg& arg); + void PMOVZXBD(X64Reg dest, const OpArg& arg); + void PMOVZXBQ(X64Reg dest, const OpArg& arg); + void PMOVZXWD(X64Reg dest, const OpArg& arg); + void PMOVZXWQ(X64Reg dest, const OpArg& arg); + void PMOVZXDQ(X64Reg dest, const OpArg& arg); + + // SSE4: blend instructions + void PBLENDVB(X64Reg dest, const OpArg& arg); + void BLENDVPS(X64Reg dest, const OpArg& arg); + void BLENDVPD(X64Reg dest, const OpArg& arg); + void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); + void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + + // AVX + void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare); + void VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg mask); + void VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + void VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + + void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + // FMA3 + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PS(X64Reg 
regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + +#define FMA4(name) \ + void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \ + void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + + FMA4(VFMADDSUBPS) + FMA4(VFMADDSUBPD) + FMA4(VFMSUBADDPS) + FMA4(VFMSUBADDPD) + FMA4(VFMADDPS) + FMA4(VFMADDPD) + FMA4(VFMADDSS) + FMA4(VFMADDSD) + FMA4(VFMSUBPS) + FMA4(VFMSUBPD) + FMA4(VFMSUBSS) + FMA4(VFMSUBSD) + FMA4(VFNMADDPS) + FMA4(VFNMADDPD) + FMA4(VFNMADDSS) + FMA4(VFNMADDSD) + FMA4(VFNMSUBPS) + FMA4(VFNMSUBPD) + FMA4(VFNMSUBSS) + FMA4(VFNMSUBSD) +#undef FMA4 + + // VEX GPR instructions + void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate); + void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void BLSR(int bits, X64Reg regOp, const OpArg& arg); + void BLSMSK(int bits, X64Reg regOp, const OpArg& arg); + void BLSI(int bits, X64Reg regOp, const OpArg& arg); + void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void RDTSC(); + + // Utility functions + // The difference between this and CALL is that this aligns the stack + // where appropriate. 
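+  // For instance (a sketch, with Hook standing in for any void Hook(u32) free function): +  //   ABI_CallFunctionC(Hook, 0x1234);  // loads ABI_PARAM1, then emits the call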
+ template <typename FunctionPointer> + void ABI_CallFunction(FunctionPointer func) + { + static_assert(std::is_pointer<FunctionPointer>() && + std::is_function<std::remove_pointer_t<FunctionPointer>>(), + "Supplied type must be a function pointer."); + + const void* ptr = reinterpret_cast<const void*>(func); + const u64 address = reinterpret_cast<u64>(ptr); + const u64 distance = address - (reinterpret_cast<u64>(code) + 5); + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) + { + // Far call + MOV(64, R(RAX), Imm64(address)); + CALLptr(R(RAX)); + } + else + { + CALL(ptr); + } + } + + template <typename FunctionPointer> + void ABI_CallFunctionC16(FunctionPointer func, u16 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC16(FunctionPointer func, u32 param1, u16 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionC(FunctionPointer func, u32 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC(FunctionPointer func, u32 param1, u32 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCP(FunctionPointer func, u32 param1, const void* param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCC(FunctionPointer func, u32 param1, u32 param2, u32 param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCP(FunctionPointer func, u32 param1, u32 param2, const void* param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(64, R(ABI_PARAM3), Imm64(reinterpret_cast<u64>(param3))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCCP(FunctionPointer func, u32 param1, u32 param2, u32 param3, + const void* param4) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + MOV(64, R(ABI_PARAM4), Imm64(reinterpret_cast<u64>(param4))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPC(FunctionPointer func, const void* param1, u32 param2) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPPC(FunctionPointer func, const void* param1, const void* param2, u32 param3) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + // Pass a register as a parameter. + template <typename FunctionPointer> + void ABI_CallFunctionR(FunctionPointer func, X64Reg reg1) + { + if (reg1 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(reg1)); + ABI_CallFunction(func); + } + + // Pass two registers as parameters.
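+  // For instance (a sketch): ABI_CallFunctionRR(SomeFn, RDX, RAX) routes the two values +  // into ABI_PARAM1/ABI_PARAM2 via MOVTwo, which also copes with the source and +  // destination registers overlapping.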
+ template <typename FunctionPointer> + void ABI_CallFunctionRR(FunctionPointer func, X64Reg reg1, X64Reg reg2) + { + MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionAC(int bits, FunctionPointer func, const Gen::OpArg& arg1, u32 param2) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionA(int bits, FunctionPointer func, const Gen::OpArg& arg1) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + ABI_CallFunction(func); + } + + // Helper method for ABI functions related to calling functions. May be used by itself as well. + void MOVTwo(int bits, X64Reg dst1, X64Reg src1, s32 offset, X64Reg dst2, X64Reg src2); + + // Saves/restores the registers and adjusts the stack to be aligned as + // required by the ABI, where the previous alignment was as specified. + // Push returns the size of the shadow space, i.e. the offset of the frame. + size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template <typename T, typename... Args> + static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args) + { + return (*f)(args...); + } + + template <typename T> + void ABI_CallLambdaC(const std::function<T(u32)>* f, u32 p1) + { + auto trampoline = &XEmitter::CallLambdaTrampoline<T, u32>; + ABI_CallFunctionPC(trampoline, reinterpret_cast<const void*>(f), p1); + } +}; // class XEmitter + +class X64CodeBlock : public Common::CodeBlock<XEmitter> +{ +private: + void PoisonMemory() override + { + // x86/64: 0xCC = breakpoint + memset(region, 0xCC, region_size); + } +}; + +} // namespace Gen diff --git a/src/dolphin/x64Reg.h b/src/dolphin/x64Reg.h new file mode 100644 index 0000000..a92e024 --- /dev/null +++ b/src/dolphin/x64Reg.h @@ -0,0 +1,96 @@ +// Copyright 2016 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included.
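+ +// Note: the values below mirror the x86 ModR/M register numbering, which is why the +// GPR, XMM and YMM names intentionally share indices; AH/CH/DH/BH carry an extra 0x100 +// marker so the emitter can tell the legacy high-byte registers apart (they cannot be +// encoded together with a REX prefix).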
+ +#pragma once + +namespace Gen +{ +enum X64Reg +{ + EAX = 0, + EBX = 3, + ECX = 1, + EDX = 2, + ESI = 6, + EDI = 7, + EBP = 5, + ESP = 4, + + RAX = 0, + RBX = 3, + RCX = 1, + RDX = 2, + RSI = 6, + RDI = 7, + RBP = 5, + RSP = 4, + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, + + AL = 0, + BL = 3, + CL = 1, + DL = 2, + SIL = 6, + DIL = 7, + BPL = 5, + SPL = 4, + AH = 0x104, + BH = 0x107, + CH = 0x105, + DH = 0x106, + + AX = 0, + BX = 3, + CX = 1, + DX = 2, + SI = 6, + DI = 7, + BP = 5, + SP = 4, + + XMM0 = 0, + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7, + XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + + YMM0 = 0, + YMM1, + YMM2, + YMM3, + YMM4, + YMM5, + YMM6, + YMM7, + YMM8, + YMM9, + YMM10, + YMM11, + YMM12, + YMM13, + YMM14, + YMM15, + + INVALID_REG = 0xFFFFFFFF +}; + +} // namespace Gen diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index c50f216..0ccaed7 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -38,6 +38,8 @@ uiWindow* win; uiCheckbox* cbDirectBoot; +uiCheckbox* cbJITEnabled; +uiEntry* enJITMaxBlockSize; int OnCloseWindow(uiWindow* window, void* blarg) { @@ -61,6 +63,14 @@ void OnOk(uiButton* btn, void* blarg) opened = false; } +void OnJITStateChanged(uiCheckbox* cb, void* blarg) +{ + if (uiCheckboxChecked(cb)) + uiControlEnable(uiControl(enJITMaxBlockSize)); + else + uiControlDisable(uiControl(enJITMaxBlockSize)); +} + void Open() { if (opened) @@ -70,7 +80,7 @@ void Open() } opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 200, 0, 0, 0); + win = uiNewWindow("Emu settings - melonDS", 300, 170, 0, 0, 0); uiWindowSetMargined(win, 1); uiWindowOnClosing(win, OnCloseWindow, NULL); @@ -79,12 +89,41 @@ void Open() { uiBox* in_ctrl = uiNewVerticalBox(); - uiBoxAppend(top, uiControl(in_ctrl), 1); + uiBoxAppend(top, uiControl(in_ctrl), 0); cbDirectBoot = uiNewCheckbox("Boot game directly"); uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); } + { + uiLabel* dummy = uiNewLabel(""); + uiBoxAppend(top, uiControl(dummy), 0); + } + + { + uiGroup* grp = uiNewGroup("JIT"); + uiBoxAppend(top, uiControl(grp), 1); + + uiBox* in_ctrl = uiNewVerticalBox(); + uiGroupSetChild(grp, uiControl(in_ctrl)); + + cbJITEnabled = uiNewCheckbox("Enable JIT recompiler"); + uiBoxAppend(in_ctrl, uiControl(cbJITEnabled), 0); + + uiCheckboxOnToggled(cbJITEnabled, OnJITStateChanged, NULL); + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + uiLabel* lbl = uiNewLabel("Maximum block size (1-32): "); + uiBoxAppend(row, uiControl(lbl), 0); + + enJITMaxBlockSize = uiNewEntry(); + uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); + } + } + { uiBox* in_ctrl = uiNewHorizontalBox(); uiBoxSetPadded(in_ctrl, 1); @@ -104,6 +143,8 @@ void Open() uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); + OnJITStateChanged(cbJITEnabled, NULL); + uiControlShow(uiControl(win)); } -- cgit v1.2.3 From ebce9f035ff05b414f1bb895beabb62bc539ac76 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 17:09:27 +0200 Subject: JIT: implemented most ALU instructions --- src/ARM.cpp | 18 +- src/ARMJIT.cpp | 16 +- src/ARMJIT.h | 25 +- src/ARMJIT_RegCache.h | 136 +++++++++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 546 +++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 245 ++++++++--------- src/ARMJIT_x64/ARMJIT_Compiler.h | 60 +++- src/CMakeLists.txt | 1 + 8 files changed, 881 insertions(+), 166 deletions(-) create mode 
100644 src/ARMJIT_RegCache.h create mode 100644 src/ARMJIT_x64/ARMJIT_ALU.cpp (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index b709277..420257a 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -517,10 +517,10 @@ void ARMv5::Execute() AddCycles_C(); }*/ - if (!ARMJIT::IsMapped(Num, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]); + /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]);*/ - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(Num, R[15] - ((CPSR&0x20)?2:4)); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) block = ARMJIT::CompileBlock(this); Cycles += block(); @@ -572,7 +572,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - if (CPSR & 0x20) // THUMB + /*if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -600,7 +600,15 @@ } else AddCycles_C(); - } + }*/ + + /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]);*/ + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); + if (block == NULL) + block = ARMJIT::CompileBlock(this); + Cycles += block(); // TODO optimize this shit!!! if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 489cdcf..74e154b 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,5 +1,7 @@ #include "ARMJIT.h" +#include <string.h> + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -8,7 +10,6 @@ namespace ARMJIT Compiler* compiler; BlockCache cache; - #define DUP2(x) x, x static ptrdiff_t JIT_MEM[2][32] = { @@ -174,4 +175,17 @@ CompiledBlock CompileBlock(ARM* cpu) return block; } +void ResetBlocks() +{ + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); + memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); + memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); + memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); + memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); + memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); + memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); + memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); + memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index d718295..2ca29e8 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -3,8 +3,6 @@ #include "types.h" -#include <string.h> - #include "ARM.h" #include "ARM_InstrInfo.h" @@ -13,14 +11,6 @@ namespace ARMJIT typedef u32 (*CompiledBlock)(); -class RegCache -{ - -static const int NativeRegAllocOrder[]; -static const int NativeRegsCount; - -}; - struct FetchedInstr { u32 A_Reg(int pos) const @@ -117,24 +107,13 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; } -inline void ResetBlocks() -{ - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); -} - void Init(); void DeInit(); CompiledBlock CompileBlock(ARM* cpu); +void ResetBlocks(); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h new
file mode 100644 index 0000000..e18d50f --- /dev/null +++ b/src/ARMJIT_RegCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include <assert.h> + +namespace ARMJIT +{ + +template <typename T, typename Reg> +class RegCache +{ +public: + RegCache() + {} + + RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->UnloadReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert(false && "Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs; + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..d06c99c --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -0,0 +1,546 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +// uses RSCRATCH3 +void Compiler::Comp_ArithTriOp(void (Compiler::*op)(int, const OpArg&, const OpArg&), + OpArg rd, OpArg rn, OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (rd == rn && !(opFlags & opInvertOp2)) + (this->*op)(32, rd, op2); + else if (opFlags & opSymmetric && op2 == R(RSCRATCH)) + { + if (opFlags & opInvertOp2) + NOT(32,
op2); + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + } + else + { + if (opFlags & opInvertOp2) + { + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + NOT(32, op2); + } + MOV(32, R(RSCRATCH3), rn); + (this->*op)(32, R(RSCRATCH3), op2); + MOV(32, rd, R(RSCRATCH3)); + } + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) +{ + switch (op) + { + case 0: // TST + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; + } + + Comp_RetriveFlags(op == 2, op >= 2, carryUsed); +} + +// also calculates cycles +OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) +{ + if (CurrentInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + carryUsed = false; + return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + if (CurrentInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + rm = Imm32(rm.Imm32() + 4); + return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + } + else + { + Comp_AddCycles_C(); + return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, + MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + } + } +} + +void Compiler::A_Comp_CmpOp() +{ + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); + + Comp_CmpOp(op - 0x8, rn, op2, carryUsed); +} + +void Compiler::A_Comp_Arith() +{ + bool S = CurrentInstr.Instr & (1 << 20); + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); + + u32 sFlag = S ? 
opSetsFlags : 0; + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x2: // SUB + Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x3: // RSB + if (op2.IsZero()) + { + if (rd != rn) + MOV(32, rd, rn); + NEG(32, rd); + if (S) + Comp_RetriveFlags(true, true, false); + } + else + Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x4: // ADD + Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + return; + case 0x7: // RSC + Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + return; + default: + assert(false && "unimplemented"); + } +} + +void Compiler::A_Comp_MovOp() +{ + bool carryUsed; + bool S = CurrentInstr.Instr & (1 << 20); + OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + + if (rd != op2) + MOV(32, rd, op2); + + if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + NOT(32, rd); + + if (S) + { + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); + } +}
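// [Editor's sketch, not part of the patch] Comp_RetriveFlags below captures the
// host flags with SETcc and folds them together with LEAs before shifting them
// into CPSR bits 31..28. In plain C++, the packing it computes (in the retriveCV
// case) is roughly the following; PackNZCV is an illustrative name, not a
// function from this codebase:
#include <cstdint>

// n/z/c/v are the 0-or-1 outputs of the SETcc instructions
inline uint32_t PackNZCV(uint32_t cpsr, uint32_t n, uint32_t z, uint32_t c, uint32_t v)
{
    uint32_t cv   = v | (c << 1);    // LEA(RSCRATCH2, RSCRATCH + RSCRATCH3*2)
    uint32_t nz   = z | (n << 1);    // LEA(RSCRATCH, RSCRATCH3 + RSCRATCH*2)
    uint32_t nzcv = cv | (nz << 2);  // LEA with SCALE_4
    return (cpsr & 0x0FFFFFFF) | (nzcv << 28);  // AND mask, SHL(28), OR into RCPSR
}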
+ +void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) +{ + CPSRDirty = true; + + bool carryOnly = !retriveCV && carryUsed; + if (retriveCV) + { + SETcc(CC_O, R(RSCRATCH)); + SETcc(sign ? CC_NC : CC_C, R(RSCRATCH3)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); + } + + if (carryUsed == 983298) + printf("something is rotten in the state of Denmark %x\n", CurrentInstr.Instr); + + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); +} + +// always uses RSCRATCH, RSCRATCH2 only if S == true +OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = S; + + if (S) + { + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + BT(32, R(RCPSR), Imm8(29)); + SETcc(CC_C, R(RSCRATCH2)); + } + + MOV(32, R(RSCRATCH), rm); + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rs); + AND(32, R(ECX), Imm32(0xFF)); + + FixupBranch zero = J_CC(CC_Z); + if (op < 3) + { + void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; + if (op == 0) + shiftOp = SHL; + else if (op == 1) + shiftOp = SHR; + else if (op == 2) + shiftOp = SAR; + + CMP(32, R(ECX), Imm8(32)); + FixupBranch lt32 = J_CC(CC_L); + FixupBranch done1; + if (op < 2) + { + FixupBranch eq32 = J_CC(CC_E); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + if (S) + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + done1 = J(); + SetJumpTarget(eq32); + } + (this->*shiftOp)(32, R(RSCRATCH), Imm8(31)); + (this->*shiftOp)(32, R(RSCRATCH), Imm8(1)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + FixupBranch done2 = J(); + + SetJumpTarget(lt32); + (this->*shiftOp)(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + if (op < 2) + SetJumpTarget(done1); + SetJumpTarget(done2); + + } + else if (op == 3) + { + if (S) + BT(32, R(RSCRATCH), Imm8(31)); + ROR_(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + } + SetJumpTarget(zero); + + return R(RSCRATCH); +} + +// may use RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + + assert(false); +} + +void Compiler::T_Comp_ShiftImm() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + int amount = (CurrentInstr.Instr >> 6) & 0x1F; + + Comp_AddCycles_C(); + + bool carryUsed; + OpArg shifted = Comp_RegShiftImm(op, amount, rs, true, carryUsed); + + if (shifted != rd) + MOV(32, rd, shifted); + + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); +} + +void Compiler::T_Comp_AddSub_() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 9) & 0x3; + + OpArg rn = op >= 2 ? 
Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + + Comp_AddCycles_C(); + + if (op & 1) + Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + else + Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); +} + +void Compiler::T_Comp_ALU_Imm8() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + + u32 op = (CurrentInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; + } +} + +void Compiler::T_Comp_ALU() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + u32 op = (CurrentInstr.Instr >> 6) & 0xF; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + int shiftOp = op == 7 ? 3 : op - 0x2; + bool carryUsed; + OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); + TEST(32, shifted, shifted); + MOV(32, rd, shifted); + Comp_RetriveFlags(false, false, true); + } + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + return; + case 0x8: // TST + Comp_CmpOp(0, rd, rs, false); + return; + case 0x9: // NEG + if (rd != rs) + MOV(32, rd, rs); + NEG(32, rd); + Comp_RetriveFlags(true, true, false); + return; + case 0xA: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0xB: // CMN + Comp_CmpOp(3, rd, rs, false); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + return; + case 0xF: // MVN + if (rd != rs) + MOV(32, rd, rs); + NOT(32, rd); + Comp_RetriveFlags(false, false, false); + return; + default: + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); + OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + + u32 op = (CurrentInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // ADD + Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); + return; + case 0x1: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0x2: // MOV + if (rd != rs) + MOV(32, rd, rs); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + } +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fb2fda8..f51d4d9 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -8,18 +8,16 @@ using namespace Gen; namespace ARMJIT { - -const int RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; -const int RegCache::NativeRegsCount = 5; +template <> +const X64Reg RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; 
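// [Editor's sketch, not part of the patch] All five registers in the allocation
// order above are callee-saved under the Microsoft x64 ABI, so guest registers
// cached in them survive the ABI_CallFunction fallbacks into the interpreter.
// LoadRegister hands them out first-free-first over a use bitmask; a minimal
// standalone model of that loop (TinyRegAlloc/HostReg are made-up names):
#include <cassert>
#include <cstdint>

enum HostReg { RBX, RSI, RDI, R12, R13, HOSTREG_COUNT };

struct TinyRegAlloc
{
    static constexpr HostReg AllocOrder[] = {RBX, RSI, RDI, R12, R13};
    uint32_t Used = 0;  // one bit per host register, like NativeRegsUsed

    HostReg Get()
    {
        for (HostReg r : AllocOrder)
            if (!(Used & (1u << r)))
            {
                Used |= 1u << r;
                return r;
            }
        assert(false && "all registers taken; the caller must unload one first");  // cf. "Welp!"
        return HOSTREG_COUNT;
    }
};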
+template <> +const int RegCache::NativeRegsAvailable = 5; Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 4); } -typedef void (Compiler::*CompileFunc)(); -typedef void (*InterpretFunc)(ARM*); - void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -36,6 +34,19 @@ void Compiler::SaveCPSR() } } +void Compiler::LoadReg(int reg, X64Reg nativeReg) +{ + if (reg != 15) + MOV(32, R(nativeReg), MDisp(RCPU, offsetof(ARM, R[reg]))); + else + MOV(32, R(nativeReg), Imm32(R15)); +} + +void Compiler::UnloadReg(int reg, X64Reg nativeReg) +{ + MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -58,12 +69,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); + // TODO: this is ugly as a whole, do better + RegCache = ARMJIT::RegCache(this, instrs, instrsCount); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurrentInstr = instrs[i]; - CompileFunc comp = NULL; + CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind); + + if (CurrentInstr.Info.Branches()) + comp = NULL; if (comp == NULL || i == instrsCount - 1) { @@ -79,6 +96,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SaveCPSR(); } + if (comp != NULL) + RegCache.Prepare(i); + else + RegCache.Flush(); + if (Thumb) { if (comp == NULL) @@ -89,8 +111,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } else - { - } + (this->*comp)(); } else { @@ -101,7 +122,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_CallFunction(ARMInterpreter::A_BLX_IMM); } else if (cond == 0xF) - AddCycles_C(); + Comp_AddCycles_C(); else { FixupBranch skipExecute; @@ -115,17 +136,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RSCRATCH), Imm32(1)); SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - + skipExecute = J_CC(CC_Z); } else { // could have used a LUT, but then where would be the fun? BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); - + skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); } - + } if (comp == NULL) @@ -136,8 +157,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); } else - { - } + (this->*comp)(); FixupBranch skipFailed; if (CurrentInstr.Cond() < 0xE) @@ -145,7 +165,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs skipFailed = J(); SetJumpTarget(skipExecute); - AddCycles_C(); + Comp_AddCycles_C(); SetJumpTarget(skipFailed); } @@ -155,13 +175,14 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs /* we don't need to collect the interpreted cycles, since all functions only add to it, the dispatcher - can take care of it. + takes care of it. */ if (comp == NULL && i != instrsCount - 1) LoadCPSR(); } + RegCache.Flush(); SaveCPSR(); LEA(32, RAX, MDisp(RCycles, ConstantCycles)); @@ -172,42 +193,57 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs return res; } -void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) +CompileFunc Compiler::GetCompFunc(int kind) { + // this might look like a waste of space, so many repetitions, but it's invaluable for debugging.
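// [Editor's sketch, not part of the patch] A NULL entry in the tables below means
// "no recompiler for this instruction kind yet"; CompileBlock then emits a call
// into the interpreter instead. Reduced to a standalone example of the
// member-function-pointer dispatch (Demo and its members are made-up names):
#include <cstdio>

struct Demo
{
    void Add() { std::puts("recompiled ADD"); }
    void Dispatch(int kind);
};

using Fn = void (Demo::*)();
static const Fn Table[2] = {&Demo::Add, nullptr};  // nullptr -> fall back

void Demo::Dispatch(int kind)
{
    if (Table[kind])
        (this->*Table[kind])();             // native fast path
    else
        std::puts("call the interpreter");  // cf. ABI_CallFunction(...InstrTable[icode])
}

int main()
{
    Demo d;
    d.Dispatch(0);  // recompiled ADD
    d.Dispatch(1);  // call the interpreter
}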
+ // see ARMInstrInfo.h for the order const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = { - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // AND + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // EOR + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // SUB + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // RSB + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ADD + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ADC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // SBC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // RSC 
+ A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ORR + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // MOV + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + // BIC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // MVN + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + // TST + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // TEQ + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // CMP + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // CMN + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -227,21 +263,34 @@ void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // Shift imm + T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, + // Three operand ADD/SUB + T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, + // 8 bit imm + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + // general ALU + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + // hi reg + T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, + // pc/sp relative + NULL, NULL, NULL, + // mem... + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + + return Thumb ? T_Comp[kind] : A_Comp[kind]; } -void Compiler::AddCycles_C() +void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 
1 : 3] @@ -253,80 +302,16 @@ void Compiler::AddCycles_C() ConstantCycles += cycles; } -// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue -OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed) -{ - carryUsed = true; - - switch (op) - { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return R(rm); - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), R(rm)); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR - MOV(32, R(RSCRATCH), R(rm)); - SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); - if (S) - { - if (amount == 0) - { - MOV(32, R(RSCRATCH2), R(rm)); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - else - SETcc(CC_C, R(RSCRATCH2)); - } - return R(RSCRATCH); - case 3: // ROR - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - ROR_(32, R(RSCRATCH), Imm8(amount)); - } - else - { - BT(32, R(RCPSR), Imm8(29)); - MOV(32, R(RSCRATCH), R(rm)); - RCR(32, R(RSCRATCH), Imm8(1)); - } - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } -} - -void Compiler::A_Comp_ALU(const FetchedInstr& instr) +void Compiler::Comp_AddCycles_CI(u32 i) { + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; + + if (CurrentInstr.Cond() < 0xE) + ADD(32, R(RCycles), Imm8(cycles)); + else + ConstantCycles += cycles; } } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 8e1d100..9b454f4 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,7 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" - +#include "../ARMJIT_RegCache.h" namespace ARMJIT { @@ -17,6 +17,10 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +class Compiler; + +typedef void (Compiler::*CompileFunc)(); + class Compiler : public Gen::X64CodeBlock { public: @@ -24,24 +28,66 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); - void StartBlock(ARM* cpu); - CompiledBlock FinaliseBlock(); + void LoadReg(int reg, Gen::X64Reg nativeReg); + void UnloadReg(int reg, Gen::X64Reg nativeReg); - void Compile(RegCache& regs, const FetchedInstr& instr); private: - void AddCycles_C(); + CompileFunc GetCompFunc(int kind); + + void Comp_AddCycles_C(); + void Comp_AddCycles_CI(u32 i); + + enum + { + opSetsFlags = 1 << 0, + opSymmetric = 1 << 1, + opRetriveCV = 1 << 2, + opInvertCarry = 1 << 3, + opSyncCarry = 1 << 4, + opInvertOp2 = 1 << 5, + }; + + void A_Comp_Arith(); + void A_Comp_MovOp(); + void A_Comp_CmpOp(); - Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed); + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALU_Imm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); - void A_Comp_ALU(const FetchedInstr& instr); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, 
Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); + Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); + + Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); void LoadCPSR(); void SaveCPSR(); + Gen::OpArg MapReg(int reg) + { + if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) + return Gen::Imm32(R15); + + assert(RegCache.Mapping[reg] != Gen::INVALID_REG); + return Gen::R(RegCache.Mapping[reg]); + } + bool CPSRDirty = false; FetchedInstr CurrentInstr; + RegCache RegCache; + bool Thumb; u32 Num; u32 R15; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87200ad..d88638a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -33,6 +33,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp -- cgit v1.2.3 From ff901141e77ad6c8d2910d77bef2b7c5674fcc7f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 18:28:01 +0200 Subject: jit: correct cycle counting for thumb shift by reg --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 7 +++++-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 0 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_LoadStore.cpp (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index d06c99c..dc82af7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -456,7 +456,10 @@ void Compiler::T_Comp_ALU() u32 op = (CurrentInstr.Instr >> 6) & 0xF; - Comp_AddCycles_C(); + if ((op >= 0x2 && op < 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); switch (op) { @@ -471,7 +474,7 @@ void Compiler::T_Comp_ALU() case 0x4: case 0x7: { - int shiftOp = op == 7 ? 3 : op - 0x2; + int shiftOp = op == 0x7 ? 
3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); TEST(32, shifted, shifted); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3 From 5f932cdf48681414465512fb47d619ad73414137 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 30 Jun 2019 13:35:03 +0200 Subject: JIT: compilation of word load and store --- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 3 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 111 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 600 ++++++++++++++++++++++++++++++++++++ src/ARM_InstrInfo.h | 8 +- src/CMakeLists.txt | 1 + src/dolphin/x64ABI.h | 3 +- 10 files changed, 712 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74e154b..4da781c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -40,8 +40,7 @@ static ptrdiff_t JIT_MEM[2][32] = { /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), /* 3X*/ offsetof(BlockCache, SWRAM), offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ -1, - offsetof(BlockCache, ARM7_WIRAM), + /* 4X*/ DUP2(-1), /* 5X*/ DUP2(-1), /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ @@ -183,7 +182,6 @@ void ResetBlocks() memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2ca29e8..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -63,14 +63,13 @@ struct BlockCache { CompiledBlock* AddrMapping[2][0x4000] = {0}; - CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM CompiledBlock ARM9_ITCM[0x8000/2]; CompiledBlock ARM9_LCDC[0xA4000/2]; CompiledBlock ARM9_BIOS[0x8000/2]; CompiledBlock ARM7_BIOS[0x4000/2]; CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM }; diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index e18d50f..ea9fb30 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -30,7 +30,7 @@ public: assert(Mapping[reg] != -1); if (DirtyRegs & (1 << reg)) - Compiler->UnloadReg(reg, Mapping[reg]); + Compiler->SaveReg(reg, Mapping[reg]); DirtyRegs &= ~(1 << reg); LoadedRegs &= ~(1 << reg); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index dc82af7..6294e1d 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -255,8 +255,8 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b if (S) { XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - BT(32, R(RCPSR), Imm8(29)); - SETcc(CC_C, R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); } MOV(32, R(RSCRATCH), rm); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index f51d4d9..9096397 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,13 +9,43 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg 
RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +const X64Reg RegCache::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13 +#else + RBX, R12, R13 +#endif +}; template <> -const int RegCache::NativeRegsAvailable = 5; +const int RegCache::NativeRegsAvailable = +#ifdef _WIN32 + 5 +#else + 3 +#endif +; Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 4); + AllocCodeSpace(1024 * 1024 * 16); + + for (int i = 0; i < 15; i++) + { + ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); + WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); + for (int j = 0; j < 2; j++) + { + ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); + WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + } + } + ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); + WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); + ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); + WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); + + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -42,7 +72,7 @@ void Compiler::LoadReg(int reg, X64Reg nativeReg) MOV(32, R(nativeReg), Imm32(R15)); } -void Compiler::UnloadReg(int reg, X64Reg nativeReg) +void Compiler::SaveReg(int reg, X64Reg nativeReg) { MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } @@ -52,7 +82,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) { ResetBlocks(); - ResetCodePtr(); + SetCodePtr((u8*)ResetStart); } CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -61,8 +91,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Thumb = cpu->CPSR & 0x20; Num = cpu->Num; R15 = cpu->R[15]; + CodeRegion = cpu->CodeRegion; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); XOR(32, R(RCycles), R(RCycles)); @@ -142,9 +173,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else { // could have used a LUT, but then where would be the fun? - BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); + skipExecute = J_CC(cond & 1 ? 
CC_NZ : CC_Z); } } @@ -187,7 +218,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LEA(32, RAX, MDisp(RCycles, ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); return res; @@ -243,23 +274,38 @@ CompileFunc Compiler::GetCompFunc(int kind) A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // Mul + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + NULL, NULL, NULL, NULL, NULL, + // STR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // STRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // LDRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // STRH + NULL, NULL, NULL, NULL, + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + NULL, NULL, NULL, NULL, + // LDRSB NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRSH + NULL, NULL, NULL, NULL, + // swap + NULL, NULL, + // LDM/STM + NULL, NULL, + // Branch + NULL, NULL, NULL, NULL, NULL, + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { @@ -278,10 +324,17 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative NULL, NULL, NULL, - // mem... - NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // LDR pcrel + NULL, + // LDR/STR reg offset + T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, + // LDR/STR sign extended, half + NULL, NULL, NULL, NULL, + // LDR/STR imm offset + T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + // LDR/STR half imm offset + NULL, NULL, + // branch, etc. 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9b454f4..7ab9b25 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,7 +29,7 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); - void UnloadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); private: CompileFunc GetCompFunc(int kind); @@ -51,12 +51,17 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MemWB(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), @@ -65,10 +70,14 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void* Gen_MemoryRoutine9(bool store, int size, u32 region); + void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); void SaveCPSR(); @@ -82,6 +91,8 @@ private: return Gen::R(RegCache.Mapping[reg]); } + void* ResetStart; + bool CPSRDirty = false; FetchedInstr CurrentInstr; @@ -91,10 +102,16 @@ private: bool Thumb; u32 Num; u32 R15; + u32 CodeRegion; u32 ConstantCycles; }; +extern void* ReadMemFuncs9[16]; +extern void* ReadMemFuncs7[2][16]; +extern void* WriteMemFuncs9[16]; +extern void* WriteMemFuncs7[2][16]; + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index e69de29..d534269 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,600 @@ +#include "ARMJIT_Compiler.h" + +#include "../GPU.h" +#include "../Wifi.h" + +namespace NDS +{ +#define MAIN_RAM_SIZE 0x400000 +extern u8* SWRAM_ARM9; +extern u32 SWRAM_ARM9Mask; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM7Mask; +extern u8 ARM7WRAM[]; +extern u16 ARM7BIOSProt; +} + +using namespace Gen; + +namespace ARMJIT +{ + +void* ReadMemFuncs9[16]; +void* ReadMemFuncs7[2][16]; +void* WriteMemFuncs9[16]; +void* WriteMemFuncs7[2][16]; + +template +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +u32 ReadVRAM9(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void WriteVRAM9(u32 addr, u32 val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} + +/* + R11 - data 
to write (store only) + RSCRATCH2 - address + RSCRATCH3 - code cycles +*/ +void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +{ + AlignCode4(); + void* res = (void*)GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // cycle counting! + // this is AddCycles_CDI + MOV(32, R(R10), R(RSCRATCH2)); + SHR(32, R(R10), Imm8(12)); + MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); + CMP(32, R(R10), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(R10), CC_G); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); + ADD(32, R(RCycles), R(RSCRATCH3)); + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + { + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch outsideDTCM = J_CC(CC_AE); + AND(32, R(RSCRATCH2), Imm32(0x3FFF)); + if (!store) + { + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); + RET(); + SetJumpTarget(outsideDTCM); + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + } + + switch (region) + { + case 0x00000000: + case 0x01000000: + { + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + RET(); + SetJumpTarget(insideITCM); + AND(32, R(RSCRATCH2), Imm32(0x7FFF)); + if (!store) + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); + else + { + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); + } + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); + TEST(64, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); + if (!store) + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); + else + { + MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + SetJumpTarget(notMapped); + } + break; + case 0x04000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9IORead32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9IOWrite32, true); + } + break; + case 0x05000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + 
CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(ReadVRAM9); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)WriteVRAM9, true); + } + break; + case 0x07000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + case 0xFF000000: + if (!store) + { + AND(32, R(RSCRATCH2), Imm32(0xFFF)); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); + } + break; + default: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9Write32, true); + } + break; + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +{ + AlignCode4(); + void* res = GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // AddCycles_CDI + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); + if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + { + if (!store && region != 0x02000000) + LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); + ADD(32, R(RCycles), R(RSCRATCH3)); + } + else + { + if (!store) + ADD(32, R(region == 0x02000000 ? 
RSCRATCH2 : RSCRATCH), Imm8(1)); + LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); + CMP(32, R(RSCRATCH3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); + CMP(32, R(R10), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + switch (region) + { + case 0x00000000: + if (!store) { + CMP(32, R(RSCRATCH2), Imm32(0x4000)); + FixupBranch outsideBIOS1 = J_CC(CC_AE); + + MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); + CMP(32, R(RSCRATCH), Imm32(0x4000)); + FixupBranch outsideBIOS2 = J_CC(CC_AE); + MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); + CMP(32, R(RSCRATCH2), R(RSCRATCH3)); + FixupBranch notDenied1 = J_CC(CC_AE); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + FixupBranch notDenied2 = J_CC(CC_B); + SetJumpTarget(outsideBIOS2); + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + RET(); + + SetJumpTarget(notDenied1); + SetJumpTarget(notDenied2); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + + SetJumpTarget(outsideBIOS1); + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); + if (!store) + { + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + { + MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + RET(); + SetJumpTarget(region); + SetJumpTarget(notMapped); + AND(32, R(RSCRATCH2), Imm32(0xFFFF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); + } + } + break; + case 0x04000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(NDS::ARM7IORead32); + ABI_PopRegistersAndAdjustStack({}, 8); + + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM7IOWrite32, true); + } + SetJumpTarget(region); + + if (!store) + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); + + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({EAX}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + MOV(32, R(RSCRATCH2), 
R(EAX)); + SHL(32, R(RSCRATCH2), Imm8(16)); + ABI_PopRegistersAndAdjustStack({EAX}, 8); + OR(32, R(EAX), R(RSCRATCH2)); + } + else + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + SHR(32, R(R11), Imm8(16)); + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + } + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(GPU::ReadVRAM_ARM7); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)GPU::WriteVRAM_ARM7, true); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + /*default: + ABI_PushRegistersAndAdjustStack({}, 8, 0); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + break;*/ + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +OpArg Compiler::A_Comp_GetMemWBOffset() +{ + if (!(CurrentInstr.Instr & (1 << 25))) + return Imm32(CurrentInstr.Instr & 0xFFF); + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + int amount = (CurrentInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); + } +} + +void Compiler::A_Comp_MemWB() +{ + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + bool load = CurrentInstr.Instr & (1 << 20); + + MOV(32, R(RSCRATCH2), rn); + if (CurrentInstr.Instr & (1 << 24)) + { + OpArg offset = A_Comp_GetMemWBOffset(); + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, R(RSCRATCH2), offset); + else + SUB(32, R(RSCRATCH2), offset); + + if (CurrentInstr.Instr & (1 << 21)) + MOV(32, rn, R(RSCRATCH2)); + } + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? 
WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, R(RSCRATCH2), R(RSCRATCH)); + + if (!(CurrentInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + if (load) + MOV(32, rd, R(RSCRATCH2)); +} + +void Compiler::T_Comp_MemReg() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + + int op = (CurrentInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + + MOV(32, R(RSCRATCH2), rb); + ADD(32, R(RSCRATCH2), ro); + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::T_Comp_MemImm() +{ + // TODO: clean this up!!! + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + bool load = op & 0x1; + + LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index e717664..dcd938b 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -83,10 +83,10 @@ enum ak_ALU(BIC), ak_ALU(MVN), - ak_ALU(TST), - ak_ALU(TEQ), - ak_ALU(CMP), - ak_ALU(CMN), + ak_Test(TST), + ak_Test(TEQ), + ak_Test(CMP), + ak_Test(CMN), ak_MUL, ak_MLA, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d88638a..662ed5c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h index 997782e..94336d0 100644 --- a/src/dolphin/x64ABI.h +++ b/src/dolphin/x64ABI.h @@ -37,7 +37,8 @@ // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. 
#define ABI_ALL_CALLER_SAVED \ - (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11}) + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) #else // 64-bit Unix / OS X #define ABI_PARAM1 RDI -- cgit v1.2.3 From 2c44bf927c230efbbd1b27920de062ddcc631fcf Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 6 Jul 2019 01:48:42 +0200 Subject: JIT: most mem instructions working + branching --- src/ARM.cpp | 10 +- src/ARMJIT.cpp | 7 +- src/ARMJIT.h | 2 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 322 ++++++++------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 145 ++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 42 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 805 +++++++++++++++--------------------- src/ARM_InstrInfo.cpp | 2 +- src/NDS.cpp | 2 + 10 files changed, 653 insertions(+), 686 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 420257a..f7ca26d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -522,8 +522,9 @@ void ARMv5::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) @@ -607,8 +608,9 @@ void ARMv4::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 4da781c..6afa967 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,12 +121,13 @@ void DeInit() delete compiler; } -CompiledBlock CompileBlock(ARM* cpu) +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; FetchedInstr instrs[12]; int i = 0; + u32 r15Initial = cpu->R[15]; u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; //printf("block %x %d\n", r15, thumb); @@ -169,9 +170,7 @@ CompiledBlock CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block); - - return block; + InsertBlock(cpu->Num, r15Initial - (thumb ? 
2 : 4), block); } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..71188f9 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index ea9fb30..556d27b 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -114,7 +114,7 @@ public: for (int reg : needToBeLoaded) LoadRegister(reg); } - DirtyRegs |= Instr.Info.DstRegs; + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6294e1d..c22751e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -71,30 +71,30 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) { switch (op) { - case 0: // TST - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - TEST(32, rn, op2); - break; - case 1: // TEQ + case 0: // TST + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - XOR(32, R(RSCRATCH3), op2); - break; - case 2: // CMP - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - CMP(32, rn, op2); - break; - case 3: // CMN + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - ADD(32, R(RSCRATCH3), op2); - break; + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; } Comp_RetriveFlags(op == 2, op >= 2, carryUsed); @@ -103,38 +103,38 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) // also calculates cycles OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) { - if (CurrentInstr.Instr & (1 << 25)) + if (CurInstr.Instr & (1 << 25)) { Comp_AddCycles_C(); carryUsed = false; - return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + return Imm32(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - if (CurrentInstr.Instr & (1 << 4)) + int op = (CurInstr.Instr >> 5) & 0x3; + if (CurInstr.Instr & (1 << 4)) { Comp_AddCycles_CI(1); - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); - if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + OpArg rm = MapReg(CurInstr.A_Reg(0)); + if (rm.IsImm() && CurInstr.A_Reg(0) == 15) rm = Imm32(rm.Imm32() + 4); - return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + return Comp_RegShiftReg(op, MapReg(CurInstr.A_Reg(8)), rm, S, carryUsed); } else { Comp_AddCycles_C(); - return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, - MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + return Comp_RegShiftImm(op, (CurInstr.Instr >> 7) & 0x1F, + MapReg(CurInstr.A_Reg(0)), S, carryUsed); } } } void Compiler::A_Comp_CmpOp() { - u32 op = (CurrentInstr.Instr >> 21) & 0xF; + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); Comp_CmpOp(op - 0x8, rn, op2, carryUsed); @@ -142,12 +142,12 @@ void Compiler::A_Comp_CmpOp() void Compiler::A_Comp_Arith() { - bool S = CurrentInstr.Instr & (1 << 20); - u32 op = (CurrentInstr.Instr >> 21) 
& 0xF; + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); u32 sFlag = S ? opSetsFlags : 0; @@ -155,13 +155,13 @@ void Compiler::A_Comp_Arith() { case 0x0: // AND Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x1: // EOR Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x2: // SUB Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x3: // RSB if (op2.IsZero()) { @@ -173,41 +173,44 @@ void Compiler::A_Comp_Arith() } else Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x4: // ADD Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); - return; + break; case 0x5: // ADC Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); - return; + break; case 0x6: // SBC Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); - return; + break; case 0x7: // RSC Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); - return; + break; case 0xC: // ORR Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0xE: // BIC Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); - return; + break; default: assert("unimplemented"); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::A_Comp_MovOp() { bool carryUsed; - bool S = CurrentInstr.Instr & (1 << 20); + bool S = CurInstr.Instr & (1 << 20); OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); if (rd != op2) MOV(32, rd, op2); - if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + if (((CurInstr.Instr >> 21) & 0xF) == 0xF) NOT(32, rd); if (S) @@ -215,6 +218,9 @@ void Compiler::A_Comp_MovOp() TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) @@ -230,7 +236,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) } if (carryUsed == 983298) - printf("something is rotten in the state of Denmark %x\n", CurrentInstr.Instr); + printf("something is rotten in the state of Denmark %x\n", CurInstr.Instr); SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); @@ -324,61 +330,61 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car switch (op) { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return rm; - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), rm); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR + case 0: // LSL + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - SAR(32, R(RSCRATCH), Imm8(amount ? 
amount : 31)); + SHL(32, R(RSCRATCH), Imm8(amount)); if (S) - { - if (amount == 0) - BT(32, rm, Imm8(31)); SETcc(CC_C, R(RSCRATCH2)); - } + return R(RSCRATCH); - case 3: // ROR + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - if (amount > 0) - ROR_(32, R(RSCRATCH), Imm8(amount)); - else - { - BT(32, R(RCPSR), Imm8(29)); - RCR(32, R(RSCRATCH), Imm8(1)); - } + SHR(32, R(RSCRATCH), Imm8(amount)); if (S) SETcc(CC_C, R(RSCRATCH2)); return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); } assert(false); @@ -386,11 +392,11 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car void Compiler::T_Comp_ShiftImm() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 11) & 0x3; - int amount = (CurrentInstr.Instr >> 6) & 0x1F; + int op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; Comp_AddCycles_C(); @@ -406,12 +412,12 @@ void Compiler::T_Comp_ShiftImm() void Compiler::T_Comp_AddSub_() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 9) & 0x3; + int op = (CurInstr.Instr >> 9) & 0x3; - OpArg rn = op >= 2 ? Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + OpArg rn = op >= 2 ? 
Imm32((CurInstr.Instr >> 6) & 0x7) : MapReg(CurInstr.T_Reg(6)); Comp_AddCycles_C(); @@ -423,38 +429,38 @@ void Compiler::T_Comp_AddSub_() void Compiler::T_Comp_ALU_Imm8() { - OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + OpArg rd = MapReg(CurInstr.T_Reg(8)); - u32 op = (CurrentInstr.Instr >> 11) & 0x3; - OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + u32 op = (CurInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurInstr.Instr & 0xFF); Comp_AddCycles_C(); switch (op) { - case 0x0: - MOV(32, rd, imm); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; - case 0x1: - Comp_CmpOp(2, rd, imm, false); - return; - case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); - return; - case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); - return; + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; } } void Compiler::T_Comp_ALU() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - u32 op = (CurrentInstr.Instr >> 6) & 0xF; + u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) Comp_AddCycles_CI(1); @@ -522,28 +528,62 @@ void Compiler::T_Comp_ALU() void Compiler::T_Comp_ALU_HiReg() { - OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); - OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + OpArg rdMapped = MapReg(rd); + OpArg rs = MapReg((CurInstr.Instr >> 3) & 0xF); - u32 op = (CurrentInstr.Instr >> 8) & 0x3; + u32 op = (CurInstr.Instr >> 8) & 0x3; Comp_AddCycles_C(); switch (op) { - case 0x0: // ADD - Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); - return; - case 0x1: // CMP - Comp_CmpOp(2, rd, rs, false); - return; - case 0x2: // MOV - if (rd != rs) - MOV(32, rd, rs); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; + case 0x0: // ADD + Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + break; + case 0x1: // CMP + Comp_CmpOp(2, rdMapped, rs, false); + return; // this is on purpose + case 0x2: // MOV + if (rdMapped != rs) + MOV(32, rdMapped, rs); + TEST(32, rdMapped, rdMapped); + Comp_RetriveFlags(false, false, false); + break; + } + + if (rd == 15) + { + OR(32, rdMapped, Imm8(1)); + Comp_JumpTo(rdMapped.GetSimpleReg()); } } +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + OpArg sp = MapReg(13); + OpArg offset = Imm32((CurInstr.Instr & 0x7F) << 2); + if (CurInstr.Instr & (1 << 7)) + SUB(32, sp, offset); + else + ADD(32, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + OpArg sp = MapReg(13); + LEA(32, rd.GetSimpleReg(), MDisp(sp.GetSimpleReg(), offset)); + } + else + MOV(32, rd, Imm32((R15 & ~2) + offset)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 9096397..b7358a2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ 
b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,7 +9,7 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache<Compiler, X64Reg>::NativeRegAllocOrder[] = +const X64Reg RegCache<Compiler, X64Reg>::NativeRegAllocOrder[] = { #ifdef _WIN32 RBX, RSI, RDI, R12, R13 @@ -18,7 +18,7 @@ const X64Reg RegCache<Compiler, X64Reg>::NativeRegAllocOrder[] = #endif }; template <> -const int RegCache<Compiler, X64Reg>::NativeRegsAvailable = +const int RegCache<Compiler, X64Reg>::NativeRegsAvailable = #ifdef _WIN32 5 #else @@ -30,24 +30,33 @@ Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); - for (int i = 0; i < 15; i++) + for (int i = 0; i < 3; i++) { - ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); - WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); for (int j = 0; j < 2; j++) { - ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); - WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); + MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); + MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } - ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); - WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); - ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); - WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); ResetStart = GetWritableCodePtr(); } +DataRegion Compiler::ClassifyAddress(u32 addr) +{ + if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase + ((ARMv5*)CurCPU)->DTCMSize) + return dataRegionDTCM; + switch (addr & 0xFF000000) + { + case 0x02000000: return dataRegionMainRAM; + case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? dataRegionWRAM7 : dataRegionSWRAM; + case 0x04000000: return dataRegionIO; + case 0x06000000: return dataRegionVRAM; + } + return dataRegionGeneric; +} + void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -92,6 +101,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Num = cpu->Num; R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; + CurCPU = cpu; ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); @@ -106,27 +116,32 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 
2 : 4; - CurrentInstr = instrs[i]; - - CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind); + CurInstr = instrs[i]; - if (CurrentInstr.Info.Branches()) - comp = NULL; + CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); if (i == instrsCount - 1) { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - SaveCPSR(); + if (comp == NULL || CurInstr.Info.Branches()) + SaveCPSR(); } + // run interpreter + cpu->CodeCycles = CurInstr.CodeCycles; + cpu->R[15] = R15; + cpu->CurInstr = CurInstr.Instr; + cpu->NextInstr[0] = CurInstr.NextInstr[0]; + cpu->NextInstr[1] = CurInstr.NextInstr[1]; + if (comp != NULL) RegCache.Prepare(i); else @@ -134,26 +149,33 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } else (this->*comp)(); + + ARMInterpreter::THUMBInstrTable[icode](cpu); } else { - u32 cond = CurrentInstr.Cond(); - if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { MOV(64, R(ABI_PARAM1), R(RCPU)); ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + + ARMInterpreter::A_BLX_IMM(cpu); } else if (cond == 0xF) + { Comp_AddCycles_C(); + cpu->AddCycles_C(); + } else { FixupBranch skipExecute; @@ -180,18 +202,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } + u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); } else (this->*comp)(); FixupBranch skipFailed; - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) { skipFailed = J(); SetJumpTarget(skipExecute); @@ -200,13 +222,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } + + if (cpu->CheckCondition(cond)) + ARMInterpreter::ARMInstrTable[icode](cpu); + else + cpu->AddCycles_C(); } } /* we don't need to collect the interpreted cycles, - since all functions only add to it, the dispatcher - takes care of it. + since cpu->Cycles is taken into account by the dispatcher. 
*/ if (comp == NULL && i != instrsCount - 1) @@ -277,29 +303,29 @@ CompileFunc Compiler::GetCompFunc(int kind) // Mul NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRD NULL, NULL, NULL, NULL, // STRD NULL, NULL, NULL, NULL, // LDRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // swap - NULL, NULL, + NULL, NULL, // LDM/STM NULL, NULL, // Branch @@ -314,26 +340,26 @@ CompileFunc Compiler::GetCompFunc(int kind) // Three operand ADD/SUB T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative - NULL, NULL, NULL, + T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, // LDR pcrel - NULL, + NULL, // LDR/STR reg offset - T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, - // LDR/STR sign extended, half - NULL, NULL, NULL, NULL, + T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, + // LDR/STR sign extended, half + T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset - NULL, NULL, + T_Comp_MemImmHalf, T_Comp_MemImmHalf, // branch, etc. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -346,10 +372,10 @@ CompileFunc Compiler::GetCompFunc(int kind) void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles); + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; @@ -358,13 +384,26 @@ void Compiler::Comp_AddCycles_C() void Compiler::Comp_AddCycles_CI(u32 i) { s32 cycles = (Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 
0 : 2] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; - - if (CurrentInstr.Cond() < 0xE) + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; + + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; } +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + SaveCPSR(); + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 7ab9b25..9395a29 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,6 +6,8 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegCache.h" +#include + namespace ARMJIT { @@ -21,6 +23,19 @@ class Compiler; typedef void (Compiler::*CompileFunc)(); +enum DataRegion +{ + dataRegionGeneric, // hey, that's me! + dataRegionMainRAM, + dataRegionSWRAM, + dataRegionVRAM, + dataRegionIO, + dataRegionExclusive, + dataRegionsCount, + dataRegionDTCM = dataRegionExclusive, + dataRegionWRAM7 = dataRegionExclusive, +}; + class Compiler : public Gen::X64CodeBlock { public: @@ -34,6 +49,8 @@ public: private: CompileFunc GetCompFunc(int kind); + void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_AddCycles_C(); void Comp_AddCycles_CI(u32 i); @@ -47,11 +64,14 @@ private: opInvertOp2 = 1 << 5, }; + DataRegion ClassifyAddress(u32 addr); + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); void A_Comp_MemWB(); + void A_Comp_MemHalf(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -59,8 +79,15 @@ private: void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_RelAddr(); + void T_Comp_AddSP(); + void T_Comp_MemReg(); void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -70,8 +97,8 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void* Gen_MemoryRoutine9(bool store, int size, u32 region); - void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + void* Gen_MemoryRoutine9(bool store, int size); + void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -92,10 +119,12 @@ private: } void* ResetStart; + void* MemoryFuncs9[3][2]; + void* MemoryFuncs7[3][2][2]; bool CPSRDirty = false; - FetchedInstr CurrentInstr; + FetchedInstr CurInstr; RegCache RegCache; @@ -105,12 +134,9 @@ private: u32 CodeRegion; u32 ConstantCycles; -}; -extern void* ReadMemFuncs9[16]; -extern void* ReadMemFuncs7[2][16]; -extern void* WriteMemFuncs9[16]; -extern void* WriteMemFuncs7[2][16]; + ARM* CurCPU; +}; } diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index d534269..69746e2 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -5,7 +5,6 @@ namespace NDS { -#define MAIN_RAM_SIZE 0x400000 extern u8* SWRAM_ARM9; extern u32 SWRAM_ARM9Mask; extern u8* SWRAM_ARM7; @@ -19,11 +18,6 @@ using 
namespace Gen; namespace ARMJIT { -void* ReadMemFuncs9[16]; -void* ReadMemFuncs7[2][16]; -void* WriteMemFuncs9[16]; -void* WriteMemFuncs7[2][16]; - template <typename T> int squeezePointer(T* ptr) { @@ -32,569 +26,434 @@ int squeezePointer(T* ptr) return truncated; } -u32 ReadVRAM9(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} +/* + According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) + of all memory load and store instructions always access addresses in the same region as + during their first execution. -void WriteVRAM9(u32 addr, u32 val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} + I tried multiple optimisations, which would benefit from this behaviour + (having fast paths for the first region, …), though none of them yielded a measurable + improvement. +*/ /* - R11 - data to write (store only) - RSCRATCH2 - address - RSCRATCH3 - code cycles + address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) + store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) + code cycles - ABI_PARAM3 */ -void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +void* Compiler::Gen_MemoryRoutine9(bool store, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); AlignCode4(); - void* res = (void*)GetWritableCodePtr(); + void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); - // cycle counting! - // this is AddCycles_CDI - MOV(32, R(R10), R(RSCRATCH2)); - SHR(32, R(R10), Imm8(12)); - MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); - CMP(32, R(R10), R(RSCRATCH3)); - CMOVcc(32, RSCRATCH3, R(R10), CC_G); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); - ADD(32, R(RCycles), R(RSCRATCH3)); - - if (!store) - XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + // cycle counting! + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 0))); + LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + CMP(32, R(ABI_PARAM4), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + + if (store) { - MOV(32, R(RSCRATCH3), R(RSCRATCH2)); - SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch outsideDTCM = J_CC(CC_AE); - AND(32, R(RSCRATCH2), Imm32(0x3FFF)); - if (!store) + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) { - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + case 32: JMP((u8*)NDS::ARM9Write32, true); break; + case 16: JMP((u8*)NDS::ARM9Write16, true); break; + case 8: JMP((u8*)NDS::ARM9Write8, true); break; } - else - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); - RET(); - SetJumpTarget(outsideDTCM); - MOV(32, R(RSCRATCH2), R(RSCRATCH3)); } - - switch (region) + else { - case 0x00000000: - case 0x01000000: - { - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - RET(); - SetJumpTarget(insideITCM); - AND(32, R(RSCRATCH2), Imm32(0x7FFF)); - if (!store) - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); - else - { - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); - } - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); - TEST(64, R(RSCRATCH3), R(RSCRATCH3)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); - if (!store) - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); - else - { - MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - SetJumpTarget(notMapped); - } - break; - case 0x04000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9IORead32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9IOWrite32, true); - } - break; - case 0x05000000: - { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); - RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); - } - break; - case 0x06000000: - MOV(32, 
R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(ReadVRAM9); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)WriteVRAM9, true); - } - break; - case 0x07000000: + if (size == 32) { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + // everything's already in the appropriate register + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - case 0xFF000000: - if (!store) - { - AND(32, R(RSCRATCH2), Imm32(0xFFF)); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); - } - break; - default: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) + else if (size == 16) { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM9Read16, true); } else + JMP((u8*)NDS::ARM9Read8, true); + } + + SetJumpTarget(insideDTCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + if (store) + MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + if (size == 32) { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9Write32, true); + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); } - break; } + RET(); - if (!store) + SetJumpTarget(insideITCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX + AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); + if (store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } } - RET(); + static_assert(RSCRATCH == EAX); + return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
1 : 0)); AlignCode4(); void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } - - // AddCycles_CDI - MOV(32, R(RSCRATCH), R(RSCRATCH2)); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); - if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + if (codeMainRAM) { - if (!store && region != 0x02000000) - LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); - ADD(32, R(RCycles), R(RSCRATCH3)); + LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); + ADD(32, R(RCycles), R(RSCRATCH)); } else { if (!store) - ADD(32, R(region == 0x02000000 ? RSCRATCH2 : RSCRATCH), Imm8(1)); - LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); - CMP(32, R(RSCRATCH3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); - CMP(32, R(R10), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(ABI_PARAM3), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); ADD(32, R(RCycles), R(RSCRATCH)); } - - if (!store) + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); + AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); + if (store) + { + MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + } + RET(); - switch (region) + SetJumpTarget(outsideMainRAM); + if (codeMainRAM) + { + if (!store) + ADD(32, R(ABI_PARAM4), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + else + { + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 
0 : 1)); + ADD(32, R(RCycles), R(RSCRATCH)); + } + if (store) + { + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) + { + case 32: JMP((u8*)NDS::ARM7Write32, true); break; + case 16: JMP((u8*)NDS::ARM7Write16, true); break; + case 8: JMP((u8*)NDS::ARM7Write8, true); break; + } + } + else { - case 0x00000000: - if (!store) { - CMP(32, R(RSCRATCH2), Imm32(0x4000)); - FixupBranch outsideBIOS1 = J_CC(CC_AE); - - MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); - CMP(32, R(RSCRATCH), Imm32(0x4000)); - FixupBranch outsideBIOS2 = J_CC(CC_AE); - MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); - CMP(32, R(RSCRATCH2), R(RSCRATCH3)); - FixupBranch notDenied1 = J_CC(CC_AE); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - FixupBranch notDenied2 = J_CC(CC_B); - SetJumpTarget(outsideBIOS2); - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - RET(); - - SetJumpTarget(notDenied1); - SetJumpTarget(notDenied2); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - - SetJumpTarget(outsideBIOS1); - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); - if (!store) - { - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - else - { - MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - RET(); - SetJumpTarget(region); - SetJumpTarget(notMapped); - AND(32, R(RSCRATCH2), Imm32(0xFFFF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); - } - } - break; - case 0x04000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(NDS::ARM7IORead32); - ABI_PopRegistersAndAdjustStack({}, 8); - - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM7IOWrite32, true); - } - SetJumpTarget(region); - - if (!store) - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); - - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({EAX}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - MOV(32, R(RSCRATCH2), R(EAX)); - SHL(32, R(RSCRATCH2), Imm8(16)); - 
ABI_PopRegistersAndAdjustStack({EAX}, 8); - OR(32, R(EAX), R(RSCRATCH2)); - } - else - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - SHR(32, R(R11), Imm8(16)); - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - } - } - break; - case 0x06000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(GPU::ReadVRAM_ARM7); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)GPU::WriteVRAM_ARM7, true); - } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - /*default: - ABI_PushRegistersAndAdjustStack({}, 8, 0); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (size == 32) + { + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - break;*/ + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else if (size == 16) + { + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM7Read16, true); + } + else + JMP((u8*)NDS::ARM7Read8, true); } + return res; +} + +void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +{ + if (store) + MOV(32, R(ABI_PARAM2), rd); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM3), Imm32(cycles)); + CALL(Num == 0 + ? MemoryFuncs9[size >> 4][store] + : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + if (!store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + if (signExtend) + MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); } - - RET(); - - return res; } OpArg Compiler::A_Comp_GetMemWBOffset() { - if (!(CurrentInstr.Instr & (1 << 25))) - return Imm32(CurrentInstr.Instr & 0xFFF); + if (!(CurInstr.Instr & (1 << 25))) + { + u32 imm = CurInstr.Instr & 0xFFF; + return Imm32(imm); + } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - int amount = (CurrentInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurInstr.A_Reg(0)); bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); } } void Compiler::A_Comp_MemWB() -{ - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); - bool load = CurrentInstr.Instr & (1 << 20); +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + int size = byte ? 
8 : 32; - MOV(32, R(RSCRATCH2), rn); - if (CurrentInstr.Instr & (1 << 24)) + if (CurInstr.Instr & (1 << 24)) { OpArg offset = A_Comp_GetMemWBOffset(); - if (CurrentInstr.Instr & (1 << 23)) - ADD(32, R(RSCRATCH2), offset); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); else - SUB(32, R(RSCRATCH2), offset); + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } - if (CurrentInstr.Instr & (1 << 21)) - MOV(32, rn, R(RSCRATCH2)); + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; else + MOV(32, R(ABI_PARAM1), rn); + + if (!(CurInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + if (load && CurInstr.A_Reg(12) == 15) + { + if (byte) + printf("!!! LDRB PC %08X\n", R15); + else + { + if (Num == 1) + AND(32, rd, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rd.GetSimpleReg()); + } + } +} + +void Compiler::A_Comp_MemHalf() +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + OpArg offset = CurInstr.Instr & (1 << 22) + ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : MapReg(CurInstr.A_Reg(0)); + + if (CurInstr.Instr & (1 << 24)) { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); + else + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } + + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + else + MOV(32, R(ABI_PARAM1), rn); - if (load) - MOV(32, R(RSCRATCH2), R(RSCRATCH)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); - if (!(CurrentInstr.Instr & (1 << 24))) + bool signExtend = false; + int size; + if (!load && op == 1) + size = 16; + else if (load) { - OpArg offset = A_Comp_GetMemWBOffset(); + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } - if (CurrentInstr.Instr & (1 << 23)) + if (!(CurInstr.Instr & (1 << 24))) + { + if (CurInstr.Instr & (1 << 23)) ADD(32, rn, offset); else SUB(32, rn, offset); } - if (load) - MOV(32, rd, R(RSCRATCH2)); + Comp_MemAccess(rd, signExtend, !load, size); + + if (load && CurInstr.A_Reg(12) == 15) + printf("!!! MemHalf op PC %08X\n", R15);; } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurrentInstr.Instr >> 10) & 0x3; + int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; - - MOV(32, R(RSCRATCH2), rb); - ADD(32, R(RSCRATCH2), ro); - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 
0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + bool byte = op & 0x1; + + MOV_sum(32, ABI_PARAM1, rb, ro); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } void Compiler::T_Comp_MemImm() { - // TODO: clean up!!! - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - - int op = (CurrentInstr.Instr >> 11) & 0x3; - u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); +} + +void Compiler::T_Comp_MemRegHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); + + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 
16 : 8; + bool signExtend = op & 1; + + MOV_sum(32, ABI_PARAM1, rb, ro); + + Comp_MemAccess(rd, signExtend, !load, size); +} + +void Compiler::T_Comp_MemImmHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, 16); } } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 41c46e1..32a9645 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -317,7 +317,7 @@ Info Decode(bool thumb, u32 num, u32 instr) else { u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; - if ((instr & 0xFE000000) == 0xFA000000) + if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; if (data & A_ARM9Only && num != 0) diff --git a/src/NDS.cpp b/src/NDS.cpp index b8fd8cb..baa5e0d 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -524,6 +524,8 @@ void Reset() KeyCnt = 0; RCnt = 0; + ARMJIT::ResetBlocks(); + NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); -- cgit v1.2.3 From ff9721111441e69b4a276a34c757476b625213c6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 10 Jul 2019 00:57:59 +0200 Subject: jit: thumb block transfer working also pc and sp relative loads and some refactoring --- src/ARMJIT_RegCache.h | 136 ---------- src/ARMJIT_RegisterCache.h | 136 ++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 82 ++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 515 +++++++++++++++++++++++++++++++----- src/ARM_InstrInfo.cpp | 46 ++-- 6 files changed, 682 insertions(+), 252 deletions(-) delete mode 100644 src/ARMJIT_RegCache.h create mode 100644 src/ARMJIT_RegisterCache.h (limited to 'src') diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h deleted file mode 100644 index 556d27b..0000000 --- a/src/ARMJIT_RegCache.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef ARMJIT_REGCACHE_H -#define ARMJIT_REGCACHE_H - -#include "ARMJIT.h" - -// TODO: replace this in the future -#include "dolphin/BitSet.h" - -#include - -namespace ARMJIT -{ - -template -class RegCache -{ -public: - RegCache() - {} - - RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) - : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) - { - for (int i = 0; i < 16; i++) - Mapping[i] = (Reg)-1; - } - - void UnloadRegister(int reg) - { - assert(Mapping[reg] != -1); - - if (DirtyRegs & (1 << reg)) - Compiler->SaveReg(reg, Mapping[reg]); - - DirtyRegs &= ~(1 << reg); - LoadedRegs &= ~(1 << reg); - NativeRegsUsed &= ~(1 << (int)Mapping[reg]); - Mapping[reg] = (Reg)-1; - } - - void LoadRegister(int reg) - { - assert(Mapping[reg] == -1); - for (int i = 0; i < NativeRegsAvailable; i++) - { - Reg nativeReg = NativeRegAllocOrder[i]; - if (!(NativeRegsUsed & (1 << nativeReg))) - { - Mapping[reg] = nativeReg; - NativeRegsUsed |= 1 << (int)nativeReg; - LoadedRegs |= 1 << reg; - - Compiler->LoadReg(reg, nativeReg); - - return; - } - } - - assert("Welp!"); - } - - void Flush() - { - BitSet16 loadedSet(LoadedRegs); - for (int reg : loadedSet) - UnloadRegister(reg); - } - - void Prepare(int i) - { - u16 futureNeeded = 0; - int ranking[16]; - for (int j = 0; j < 16; j++) - ranking[j] = 0; - for (int j = i; j < InstrsCount; j++) - { - BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); - futureNeeded |= regsNeeded.m_val; - for (int reg : regsNeeded) - 
ranking[reg]++; - } - - // we'll unload all registers which are never used again - BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); - for (int reg : neverNeededAgain) - UnloadRegister(reg); - - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; - BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); - if (needToBeLoaded != BitSet16(0)) - { - int neededCount = needToBeLoaded.Count(); - BitSet16 loadedSet(LoadedRegs); - while (loadedSet.Count() + neededCount > NativeRegsAvailable) - { - int leastReg = -1; - int rank = 1000; - for (int reg : loadedSet) - { - if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) - { - leastReg = reg; - rank = ranking[reg]; - } - } - - assert(leastReg != -1); - UnloadRegister(leastReg); - - loadedSet.m_val = LoadedRegs; - } - - for (int reg : needToBeLoaded) - LoadRegister(reg); - } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); - } - - static const Reg NativeRegAllocOrder[]; - static const int NativeRegsAvailable; - - Reg Mapping[16]; - u32 NativeRegsUsed = 0; - u16 LoadedRegs = 0; - u16 DirtyRegs = 0; - - T* Compiler; - - FetchedInstr* Instrs; - int InstrsCount; -}; - -} - -#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h new file mode 100644 index 0000000..04c1eda --- /dev/null +++ b/src/ARMJIT_RegisterCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include + +namespace ARMJIT +{ + +template +class RegisterCache +{ +public: + RegisterCache() + {} + + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->SaveReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : 
loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b7358a2..4fe0c70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,20 +9,20 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = +const X64Reg RegisterCache::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13 + RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13 + RBX, R12, R13, R14 // this is sad #endif }; template <> -const int RegCache::NativeRegsAvailable = +const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 - 5 + 6 #else - 3 + 4 #endif ; @@ -39,10 +39,47 @@ Compiler::Compiler() MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) + { + MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); + MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); + MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); + } ResetStart = GetWritableCodePtr(); } +void* Compiler::Gen_ChangeCPSRRoutine() +{ + void* res = (void*)GetWritableCodePtr(); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + + SetJumpTarget(fiq); + + SetJumpTarget(irq); + + SetJumpTarget(svc); + + SetJumpTarget(abt); + + SetJumpTarget(und); + + return res; +} + DataRegion Compiler::ClassifyAddress(u32 addr) { if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) @@ -106,12 +143,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); - XOR(32, R(RCycles), R(RCycles)); LoadCPSR(); // TODO: this is ugly as a whole, do better - RegCache = ARMJIT::RegCache(this, instrs, instrsCount); + RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) { @@ -242,7 +278,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs RegCache.Flush(); SaveCPSR(); - LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + MOV(32, R(RAX), Imm32(ConstantCycles)); ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); @@ -306,18 +342,20 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // STRB + //NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD - NULL, NULL, NULL, NULL, - // STRD - NULL, NULL, NULL, NULL, + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // LDRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB @@ -360,10 +398,14 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // branch, etc. - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL + // LDR/STR sp rel + NULL, NULL, + // PUSH/POP + NULL, NULL, + // LDMIA, STMIA + NULL, NULL, + NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL }; return Thumb ? T_Comp[kind] : A_Comp[kind]; @@ -376,7 +418,7 @@ void Compiler::Comp_AddCycles_C() : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -388,13 +430,15 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i;
 
     if (CurInstr.Cond() < 0xE)
-        ADD(32, R(RCycles), Imm8(cycles));
+        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
     else
         ConstantCycles += cycles;
 }
 
 void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR)
 {
+    // potential bug: if a register that is still cached gets saved on a mode switch,
+    // the old value is the one that is saved
     SaveCPSR();
 
     MOV(64, R(ABI_PARAM1), R(RCPU));
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index 9395a29..a751737 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -4,7 +4,7 @@
 #include "../dolphin/x64Emitter.h"
 
 #include "../ARMJIT.h"
-#include "../ARMJIT_RegCache.h"
+#include "../ARMJIT_RegisterCache.h"
 
 #include 
 
@@ -12,7 +12,6 @@ namespace ARMJIT
 {
 
 const Gen::X64Reg RCPU = Gen::RBP;
-const Gen::X64Reg RCycles = Gen::R14;
 const Gen::X64Reg RCPSR = Gen::R15;
 
 const Gen::X64Reg RSCRATCH = Gen::EAX;
@@ -72,6 +71,7 @@ private:
 
     void A_Comp_MemWB();
     void A_Comp_MemHalf();
+    void A_Comp_LDM_STM();
 
     void T_Comp_ShiftImm();
     void T_Comp_AddSub_();
@@ -86,8 +86,13 @@ private:
     void T_Comp_MemImm();
     void T_Comp_MemRegHalf();
     void T_Comp_MemImmHalf();
+    void T_Comp_LoadPCRel();
+    void T_Comp_MemSPRel();
+    void T_Comp_PUSH_POP();
+    void T_Comp_LDMIA_STMIA();
 
     void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size);
+    s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode);
 
     void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
                          Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags);
@@ -100,6 +105,11 @@ private:
     void* Gen_MemoryRoutine9(bool store, int size);
     void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size);
 
+    void* Gen_MemoryRoutineSeq9(bool store, bool preinc);
+    void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM);
+
+    void* Gen_ChangeCPSRRoutine();
+
    Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed);
    Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed);
 
@@ -122,11 +132,14 @@ private:
     void* MemoryFuncs9[3][2];
     void* MemoryFuncs7[3][2][2];
 
+    void* MemoryFuncsSeq9[2][2];
+    void* MemoryFuncsSeq7[2][2][2];
+
     bool CPSRDirty = false;
 
     FetchedInstr CurInstr;
 
-    RegCache<Compiler> RegCache;
+    RegisterCache<Compiler> RegCache;
 
     bool Thumb;
     u32 Num;
diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
index 69746e2..20e1893 100644
--- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
@@ -3,16 +3,6 @@
 #include "../GPU.h"
 #include "../Wifi.h"
 
-namespace NDS
-{
-extern u8* SWRAM_ARM9;
-extern u32 SWRAM_ARM9Mask;
-extern u8* SWRAM_ARM7;
-extern u32 SWRAM_ARM7Mask;
-extern u8 ARM7WRAM[];
-extern u16 ARM7BIOSProt;
-}
-
 using namespace Gen;
 
 namespace ARMJIT
@@ -41,6 +31,49 @@ int squeezePointer(T* ptr)
     store value - ABI_PARAM2 (a.k.a.
RDX = RSCRATCH2 on Windows) code cycles - ABI_PARAM3 */ + +#define CALC_CYCLES_9(numC, numD, scratch) \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ + CMP(32, R(numC), R(numD)); \ + CMOVcc(32, numD, R(numC), CC_G); \ + CMP(32, R(numD), R(scratch)); \ + CMOVcc(32, scratch, R(numD), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); +#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + LEA(32, scratch, MRegSum(numD, numC)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + if (!store) \ + ADD(32, R(numC), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } +#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + if (!store) \ + ADD(32, R(numD), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } + void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -56,15 +89,10 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) FixupBranch insideITCM = J_CC(CC_B); // cycle counting! - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 0))); - LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - CMP(32, R(ABI_PARAM4), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); + MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); + SHR(32, R(ABI_PARAM4), Imm8(12)); + MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1))); + CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { @@ -101,7 +129,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -120,7 +148,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) @@ -158,28 +186,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); AND(32, R(RSCRATCH), Imm32(0xFF000000)); CMP(32, R(RSCRATCH), Imm32(0x02000000)); FixupBranch outsideMainRAM = J_CC(CC_NE); - if (codeMainRAM) - { - LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - if (!store) - ADD(32, R(ABI_PARAM3), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); if (store) @@ -205,22 +218,7 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) RET(); SetJumpTarget(outsideMainRAM); - if (codeMainRAM) - { - if (!store) - ADD(32, R(ABI_PARAM4), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 0 : 1)); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { if (size > 8) @@ -257,7 +255,189 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) return res; } -void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +#define MEMORY_SEQ_WHILE_COND \ + if (!store) \ + MOV(32, currentElement, R(EAX));\ + if (!preinc) \ + ADD(32, R(ABI_PARAM1), Imm8(4)); \ + \ + SUB(32, R(ABI_PARAM3), Imm8(1)); \ + J_CC(CC_NZ, repeat); + +/* + ABI_PARAM1 address + ABI_PARAM2 address where registers are stored + ABI_PARAM3 how many values to read/write + ABI_PARAM4 code cycles + + Dolphin x64CodeEmitter is my favourite assembler + */ +void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); + + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM9Write32); + } + else + CALL((void*)NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); + MOVZX(32, 8, 
RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); + + FixupBranch finishIt1 = J(); + + SetJumpTarget(insideDTCM); + AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time + MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential + FixupBranch finishIt2 = J(); + + SetJumpTarget(insideITCM); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); + XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); + MOV(32, R(ABI_PARAM2), Imm32(1)); + + SetJumpTarget(finishIt1); + SetJumpTarget(finishIt2); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM7Write32); + } + else + CALL((void*)NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + SetJumpTarget(outsideMainRAM); + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +#undef CALC_CYCLES_9 +#undef MEMORY_SEQ_WHILE_COND + +void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int 
size) { if (store) MOV(32, R(ABI_PARAM2), rd); @@ -278,6 +458,129 @@ void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int si } } +s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + int regsCount = regs.Count(); + + const u8 userModeOffsets[] = + { + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, + + offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), + offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, + }; + + if (decrement) + { + MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + preinc = !preinc; + } + else + MOV(32, R(ABI_PARAM1), rb); + + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); + if (!store) + { + SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? MemoryFuncsSeq9[0][preinc] + : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + + for (int reg = 15; reg >= 0; reg--) + { + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH2), R(RCPSR)); + AND(32, R(RSCRATCH2), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + POP(RSCRATCH); + MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); + } + else + { + if (reg != 15) + RegCache.DirtyRegs |= (1 << reg); + POP(MapReg(reg).GetSimpleReg()); + } + } + } + + if (regs[15]) + { + if (Num == 1) + OR(32, MapReg(15), Imm8(1)); + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + } + else + { + for (int reg : regs) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); + PUSH(RSCRATCH); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + PUSH(MapReg(reg).GetSimpleReg()); + } + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? 
MemoryFuncsSeq9[1][preinc] + : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + + ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + } + + return (regsCount * 4) * (decrement ? -1 : 1); +} + OpArg Compiler::A_Comp_GetMemWBOffset() { if (!(CurInstr.Instr & (1 << 25))) @@ -354,6 +657,25 @@ void Compiler::A_Comp_MemHalf() ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) : MapReg(CurInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); + + bool signExtend = false; + int size; + if (!load) + { + size = op == 1 ? 16 : 32; + load = op == 2; + } + else if (load) + { + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } + + if (size == 32 && Num == 1) + return; // NOP + if (CurInstr.Instr & (1 << 24)) { if (CurInstr.Instr & (1 << 23)) @@ -370,19 +692,6 @@ void Compiler::A_Comp_MemHalf() else MOV(32, R(ABI_PARAM1), rn); - int op = (CurInstr.Instr >> 5) & 0x3; - bool load = CurInstr.Instr & (1 << 20); - - bool signExtend = false; - int size; - if (!load && op == 1) - size = 16; - else if (load) - { - size = op == 2 ? 8 : 16; - signExtend = op > 1; - } - if (!(CurInstr.Instr & (1 << 24))) { if (CurInstr.Instr & (1 << 23)) @@ -412,6 +721,24 @@ void Compiler::T_Comp_MemReg() Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = (CurInstr.Instr >> 20) & 1; + bool pre = (CurInstr.Instr >> 24) & 1; + bool add = (CurInstr.Instr >> 23) & 1; + bool writeback = (CurInstr.Instr >> 21) & 1; + bool usermode = (CurInstr.Instr >> 22) & 1; + + OpArg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + + if (writeback) + ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? 
Imm8(offset) : Imm32(offset)); +} + void Compiler::T_Comp_MemImm() { OpArg rd = MapReg(CurInstr.T_Reg(0)); @@ -456,4 +783,56 @@ void Compiler::T_Comp_MemImmHalf() Comp_MemAccess(rd, false, !load, 16); } +void Compiler::T_Comp_LoadPCRel() +{ + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + // hopefully this doesn't break + u32 val; CurCPU->DataRead32(addr, &val); + MOV(32, rd, Imm32(val)); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, 32); +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + OpArg sp = MapReg(13); + + s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + + ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + OpArg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + ADD(32, rb, Imm8(offset)); +} + } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 32a9645..c519229 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -25,9 +25,7 @@ enum { A_Link = 1 << 10, - A_LDMSTM = 1 << 11, - - A_ARM9Only = 1 << 12, + A_UnkOnARM7 = 1 << 11, }; #define A_BIOP A_Read16 @@ -97,12 +95,12 @@ const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); -const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ); +const u32 A_CLZ = A_Write12 | A_Read0 | A_UnkOnARM7 | ak(ak_CLZ); -const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD); -const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB); -const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD); -const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB); +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 #define A_STR A_Read12 @@ -144,8 +142,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM); +const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -154,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = 
A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC); +const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -249,7 +247,7 @@ const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); @@ -320,8 +318,10 @@ Info Decode(bool thumb, u32 num, u32 instr) if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; - if (data & A_ARM9Only && num != 0) - data |= A_BranchAlways | A_Link; + if (data & A_UnkOnARM7 && num != 0) + data = A_UNK; + + res.Kind = (data >> 13) & 0x1FF; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); @@ -360,14 +360,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= 1 << 15; } - if (data & A_LDMSTM) - { - res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15); - if (instr & (1 << 21)) - res.DstRegs |= 1 << ((instr >> 16) & 0xF); - } - - res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_LDM) + res.DstRegs |= instr & (1 << 15); // this is right return res; } -- cgit v1.2.3 From c58fdbd66bab9f1b97e9522afa5436f212540b6d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 11 Jul 2019 16:22:47 +0200 Subject: jit: branch instructions --- src/ARM.cpp | 12 +- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 267 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 185 ++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 30 ++-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 42 +----- src/ARM_InstrInfo.cpp | 6 +- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 1 + 10 files changed, 363 insertions(+), 187 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_Branch.cpp (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index f7ca26d..aca876d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -521,11 +521,8 @@ void ARMv5::Execute() printf("aaarg ungempappter raum %x\n", R[15]);*/ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - if (block == NULL) - ARMJIT::CompileBlock(this); - else - Cycles += block(); - + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + // TODO optimize this shit!!! if (Halted) { @@ -607,10 +604,7 @@ void ARMv4::Execute() printf("aaarg ungempappter raum %x\n", R[15]);*/ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - if (block == NULL) - ARMJIT::CompileBlock(this); - else - Cycles += block(); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); // TODO optimize this shit!!! 
if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 6afa967..47b425f 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,7 +121,7 @@ void DeInit() delete compiler; } -void CompileBlock(ARM* cpu) +CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -171,6 +171,8 @@ void CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); InsertBlock(cpu->Num, r15Initial - (thumb ? 2 : 4), block); + + return block; } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 71188f9..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -void CompileBlock(ARM* cpu); +CompiledBlock CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..fb2acba --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -0,0 +1,267 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + u32 newPC; + u32 nextInstr[2]; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + OR(32, R(RCPSR), Imm8(0x20)); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + AND(32, R(RCPSR), Imm32(~0x20)); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; + cycles += CurCPU->CodeCycles; + nextInstr[1] = cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + nextInstr[0] = cpu9->CodeRead32(addr, true); + nextInstr[1] = nextInstr[0] >> 16; + cycles += CurCPU->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + nextInstr[0] = cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + nextInstr[1] = cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); + nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + nextInstr[0] = cpu7->CodeRead32(addr); + nextInstr[1] = cpu7->CodeRead32(addr+4); + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + } + } + + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + MOV(32, MDisp(RCPU, 
offsetof(ARM, NextInstr[0])), Imm32(nextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(nextInstr[1])); + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + + if (setupRegion) + { + MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), Imm32(newPC)); + CALL((void*)&ARMv5::SetupCodeMem); + } +} + +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + bool previouslyDirty = CPSRDirty; + SaveCPSR(); + + if (restoreCPSR) + { + if (Thumb || CurInstr.Cond() >= 0xE) + { + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + // the ugly way... + // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + } + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + if (!restoreCPSR) + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + else + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); + + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + + if (previouslyDirty) + LoadCPSR(); + CPSRDirty = previouslyDirty; +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOV(32, MapReg(14), Imm32(R15 - 4)); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + OpArg rn = MapReg(CurInstr.A_Reg(0)); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOV(32, MapReg(14), Imm32(R15 - 4)); + Comp_JumpTo(rn.GetSimpleReg()); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + if (link && Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + + OpArg rn = MapReg(CurInstr.A_Reg(3)); + if (link) + MOV(32, MapReg(14), Imm32(R15 - 1)); + Comp_JumpTo(rn.GetSimpleReg()); +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOV(32, MapReg(14), Imm32(R15 + offset)); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + OpArg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + LEA(32, RSCRATCH, MDisp(lr.GetSimpleReg(), offset)); + MOV(32, lr, Imm32((R15 - 2) | 1)); + if (Num == 1 || CurInstr.Instr & (1 << 12)) + OR(32, R(RSCRATCH), Imm8(1)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +{ + assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); + Comp_AddCycles_C(); + + u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); + target += (CurInstr.Instr & 0x7FF) << 1; + + if (Num == 1 || CurInstr.Instr & (1 << 12)) + target |= 1; + + MOV(32, MapReg(14), Imm32((R15 - 2) | 
1)); + + Comp_JumpTo(target); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 4fe0c70..6799a90 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -50,50 +50,6 @@ Compiler::Compiler() ResetStart = GetWritableCodePtr(); } -void* Compiler::Gen_ChangeCPSRRoutine() -{ - void* res = (void*)GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - CMP(32, R(RSCRATCH), Imm8(0x11)); - FixupBranch fiq = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x12)); - FixupBranch irq = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x13)); - FixupBranch svc = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x17)); - FixupBranch abt = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x1B)); - FixupBranch und = J_CC(CC_E); - - SetJumpTarget(fiq); - - SetJumpTarget(irq); - - SetJumpTarget(svc); - - SetJumpTarget(abt); - - SetJumpTarget(und); - - return res; -} - -DataRegion Compiler::ClassifyAddress(u32 addr) -{ - if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) - return dataRegionDTCM; - switch (addr & 0xFF000000) - { - case 0x02000000: return dataRegionMainRAM; - case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? dataRegionWRAM7 : dataRegionSWRAM; - case 0x04000000: return dataRegionIO; - case 0x06000000: return dataRegionVRAM; - } - return dataRegionGeneric; -} - void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -123,6 +79,29 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } +// invalidates RSCRATCH and RSCRATCH3 +Gen::FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + return J_CC(CC_Z); + } + else + { + // could have used a LUT, but then where would be the fun? + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); + + return J_CC(cond & 1 ? 
CC_NZ : CC_Z); + } +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -140,6 +119,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CodeRegion = cpu->CodeRegion; CurCPU = cpu; + bool mergedThumbBL = false; + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -167,17 +148,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - if (comp == NULL || CurInstr.Info.Branches()) + if (comp == NULL) SaveCPSR(); } - - // run interpreter - cpu->CodeCycles = CurInstr.CodeCycles; - cpu->R[15] = R15; - cpu->CurInstr = CurInstr.Instr; - cpu->NextInstr[0] = CurInstr.NextInstr[0]; - cpu->NextInstr[1] = CurInstr.NextInstr[1]; - + if (comp != NULL) RegCache.Prepare(i); else @@ -185,58 +159,44 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) + if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 + && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) + mergedThumbBL = true; + else { - MOV(64, R(ABI_PARAM1), R(RCPU)); + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + } + else if (mergedThumbBL) + T_Comp_BL_Merged(instrs[i - 1]); + else + (this->*comp)(); } - else - (this->*comp)(); - - ARMInterpreter::THUMBInstrTable[icode](cpu); } else { u32 cond = CurInstr.Cond(); if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { - MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::A_BLX_IMM); - - ARMInterpreter::A_BLX_IMM(cpu); + if (comp) + (this->*comp)(); + else + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } } else if (cond == 0xF) - { Comp_AddCycles_C(); - cpu->AddCycles_C(); - } else { FixupBranch skipExecute; if (cond < 0xE) - { - if (cond >= 0x8) - { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(RSCRATCH3), R(RCPSR)); - SHR(32, R(RSCRATCH3), Imm8(28)); - MOV(32, R(RSCRATCH), Imm32(1)); - SHL(32, R(RSCRATCH), R(RSCRATCH3)); - TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - - skipExecute = J_CC(CC_Z); - } - else - { - // could have used a LUT, but then where would be the fun? - TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - - skipExecute = J_CC(cond & 1 ? CC_NZ : CC_Z); - } - - } + skipExecute = CheckCondition(cond); u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) @@ -258,19 +218,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } - - if (cpu->CheckCondition(cond)) - ARMInterpreter::ARMInstrTable[icode](cpu); - else - cpu->AddCycles_C(); } } - /* - we don't need to collect the interpreted cycles, - since cpu->Cycles is taken into account by the dispatcher. 
-    */
-
         if (comp == NULL && i != instrsCount - 1)
             LoadCPSR();
     }
 
@@ -367,7 +317,7 @@ CompileFunc Compiler::GetCompFunc(int kind)
         // LDM/STM
         NULL, NULL,
         // Branch
-        NULL, NULL, NULL, NULL, NULL,
+        A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg,
         // system stuff
         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     };
 
@@ -389,7 +339,7 @@ CompileFunc Compiler::GetCompFunc(int kind)
         // pc/sp relative
         T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP,
         // LDR pcrel
-        NULL,
+        T_Comp_LoadPCRel,
         // LDR/STR reg offset
         T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg,
         // LDR/STR sign extended, half
         T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf,
         // LDR/STR imm offset
         T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm,
         // LDR/STR half imm offset
         T_Comp_MemImmHalf, T_Comp_MemImmHalf,
         // LDR/STR sp rel
-        NULL, NULL,
+        T_Comp_MemSPRel, T_Comp_MemSPRel,
         // PUSH/POP
-        NULL, NULL,
+        T_Comp_PUSH_POP, T_Comp_PUSH_POP,
         // LDMIA, STMIA
-        NULL, NULL,
-        NULL, NULL,
-        NULL, NULL, NULL, NULL, NULL, NULL
+        T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA,
+        // Branch
+        T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2,
+        // Unk, SVC
+        NULL, NULL
     };
 
     return Thumb ? T_Comp[kind] : A_Comp[kind];
 }
 
-void Compiler::Comp_AddCycles_C()
+void Compiler::Comp_AddCycles_C(bool forceNonConstant)
 {
     s32 cycles = Num ?
         NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3]
         : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles);
 
-    if (CurInstr.Cond() < 0xE)
+    if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant)
         ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
     else
         ConstantCycles += cycles;
 }
 
@@ -429,25 +381,10 @@ void Compiler::Comp_AddCycles_CI(u32 i)
         NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
         : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i;
 
-    if (CurInstr.Cond() < 0xE)
+    if (!Thumb && CurInstr.Cond() < 0xE)
         ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
     else
         ConstantCycles += cycles;
 }
 
-void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR)
-{
-    // potential bug: if a register that is still cached gets saved on a mode switch,
-    // the old value is the one that is saved
-    SaveCPSR();
-
-    MOV(64, R(ABI_PARAM1), R(RCPU));
-    MOV(32, R(ABI_PARAM2), R(addr));
-    MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR));
-    if (Num == 0)
-        CALL((void*)&ARMv5::JumpTo);
-    else
-        CALL((void*)&ARMv4::JumpTo);
-}
-
 }
\ No newline at end of file
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index a751737..45b488a 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -22,19 +22,6 @@ class Compiler;
 
 typedef void (Compiler::*CompileFunc)();
 
-enum DataRegion
-{
-    dataRegionGeneric, // hey, that's me!
- dataRegionMainRAM, - dataRegionSWRAM, - dataRegionVRAM, - dataRegionIO, - dataRegionExclusive, - dataRegionsCount, - dataRegionDTCM = dataRegionExclusive, - dataRegionWRAM7 = dataRegionExclusive, -}; - class Compiler : public Gen::X64CodeBlock { public: @@ -49,8 +36,9 @@ private: CompileFunc GetCompFunc(int kind); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); - void Comp_AddCycles_C(); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); enum @@ -63,8 +51,6 @@ private: opInvertOp2 = 1 << 5, }; - DataRegion ClassifyAddress(u32 addr); - void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); @@ -73,6 +59,9 @@ private: void A_Comp_MemHalf(); void A_Comp_LDM_STM(); + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -91,6 +80,13 @@ private: void T_Comp_PUSH_POP(); void T_Comp_LDMIA_STMIA(); + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(FetchedInstr prefix); + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); @@ -119,6 +115,8 @@ private: void LoadCPSR(); void SaveCPSR(); + Gen::FixupBranch CheckCondition(u32 cond); + Gen::OpArg MapReg(int reg) { if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 20e1893..69b324c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -462,38 +462,10 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { int regsCount = regs.Count(); - const u8 userModeOffsets[] = - { - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, - - offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), - offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, - }; - if (decrement) { MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); - preinc = !preinc; + preinc ^= true; } else MOV(32, R(ABI_PARAM1), rb); @@ -516,16 +488,16 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + /*if (usermode && reg >= 8 && reg < 15) { MOV(32, R(RSCRATCH2), R(RCPSR)); AND(32, R(RSCRATCH2), Imm8(0x1F)); // (RSCRATCH2 - 0x11) * 8 + 
squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); POP(RSCRATCH); MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else */if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -552,16 +524,16 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + /*if (usermode && reg >= 8 && reg < 15) { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); PUSH(RSCRATCH); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else */if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index c519229..b8dff00 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -255,7 +255,7 @@ const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); -const u32 T_BLX_REG = T_BranchAlways | T_ReadR15 | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); const u32 T_B = T_BranchAlways | tk(tk_B); const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); @@ -301,6 +301,10 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= (1 << 13); if (data & T_ReadR15) res.SrcRegs |= (1 << 15); + if (data & T_WriteR14) + res.DstRegs |= (1 << 14); + if (data & T_ReadR14) + res.SrcRegs |= (1 << 14); if (data & T_BranchAlways) res.DstRegs |= (1 << 15); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index dcd938b..51dcfa2 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -202,6 +202,7 @@ enum tk_POP, tk_LDMIA, tk_STMIA, + tk_BCOND, tk_BX, tk_BLX_REG, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 662ed5c..9401220 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,7 @@ add_library(core STATIC ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp -- cgit v1.2.3 From 2efab201e936ab0f60baf1de8e957080141d2d93 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 03:43:45 +0200 Subject: jit: LDM/STM finally(!) 
working + MUL, MLA and CLZ --- src/ARM.cpp | 7 +++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 74 +++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Branch.cpp | 7 +-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 108 +++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 14 ++++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 116 +++++++++++++++++++++++++----------- 6 files changed, 279 insertions(+), 47 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index aca876d..a77fbc4 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -80,8 +80,15 @@ ARMv4::ARMv4() : ARM(1) // } +namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} + void ARM::Reset() { + FILE* blabla = fopen("fhhg", "w"); + for (int i = 0; i < ARMInstrInfo::ak_Count; i++) + fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); + fclose(blabla); + Cycles = 0; Halted = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index c22751e..cbe67fd 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -223,6 +223,73 @@ void Compiler::A_Comp_MovOp() Comp_JumpTo(rd.GetSimpleReg(), S); } +void Compiler::A_Comp_CLZ() +{ + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + + MOV(32, R(RSCRATCH), Imm32(32)); + TEST(32, rm, rm); + FixupBranch skipZero = J_CC(CC_Z); + BSR(32, RSCRATCH, rm); + XOR(32, R(RSCRATCH), Imm8(0x1F)); // 31 - RSCRATCH + SetJumpTarget(skipZero); + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn) +{ + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, add ? 
2 : 1); + } + + static_assert(EAX == RSCRATCH); + MOV(32, R(RSCRATCH), rm); + if (add) + { + IMUL(32, RSCRATCH, rs); + LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); + TEST(32, rd, rd); + } + else + { + IMUL(32, RSCRATCH, rs); + MOV(32, rd, R(RSCRATCH)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + + if (S) + Comp_RetriveFlags(false, false, false); +} + +void Compiler::A_Comp_MUL_MLA() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn; + if (add) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_MulOp(S, add, rd, rm, rs, rn); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -455,6 +522,13 @@ void Compiler::T_Comp_ALU_Imm8() } } +void Compiler::T_Comp_MUL() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + Comp_MulOp(true, false, rd, rd, rs, Imm8(-1)); +} + void Compiler::T_Comp_ALU() { OpArg rd = MapReg(CurInstr.T_Reg(0)); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index fb2acba..bd01ffb 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -126,17 +126,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) { if (Thumb || CurInstr.Cond() >= 0xE) - { - for (int reg : hiRegsLoaded) - RegCache.UnloadRegister(reg); - } + RegCache.Flush(); else { // the ugly way... diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6799a90..8a895d1 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,10 +26,14 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +int instructionPopularityARM[ARMInstrInfo::ak_Count]; + Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); + memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -47,7 +51,88 @@ Compiler::Compiler() MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); } - ResetStart = GetWritableCodePtr(); + { + // RSCRATCH mode + // ABI_PARAM2 reg number + // ABI_PARAM3 value in current mode + // ret - ABI_PARAM3 + ReadBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + RET(); + SetJumpTarget(irq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + RET(); + SetJumpTarget(svc); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + RET(); + SetJumpTarget(abt); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + RET(); + SetJumpTarget(und); + MOV(32, 
R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + RET(); + } + { + // RSCRATCH mode + // ABI_PARAM2 reg n + // ABI_PARAM3 value + // carry flag set if the register isn't banked + WriteBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + STC(); + RET(); + + SetJumpTarget(fiq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(irq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(svc); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(abt); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(und); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + CLC(); + RET(); + } + + ResetStart = (void*)GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -136,6 +221,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); + + if (!Thumb) + instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; if (comp == NULL || i == instrsCount - 1) { @@ -287,9 +375,9 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -315,7 +403,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // swap NULL, NULL, // LDM/STM - NULL, NULL, + A_Comp_LDM_STM, A_Comp_LDM_STM, // Branch A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, // system stuff @@ -333,7 +421,7 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative @@ -387,4 +475,14 @@ void Compiler::Comp_AddCycles_CI(u32 i) ConstantCycles += cycles; } +void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles); + + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 45b488a..89dfe28 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -40,6 +40,7 @@ private: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); + void Comp_AddCycles_CI(Gen::X64Reg i, int add); enum { @@ -55,6 +56,10 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MUL_MLA(); + + void A_Comp_CLZ(); + void A_Comp_MemWB(); void A_Comp_MemHalf(); void A_Comp_LDM_STM(); @@ -62,11 +67,13 @@ private: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MUL(); void T_Comp_RelAddr(); void T_Comp_AddSP(); @@ -88,7 +95,7 @@ private: void T_Comp_BL_Merged(FetchedInstr prefix); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); - s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -96,6 +103,8 @@ private: Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + void Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn); + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); void* Gen_MemoryRoutine9(bool store, int size); @@ -133,6 +142,9 @@ private: void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; + void* ReadBanked; + void* WriteBanked; + bool CPSRDirty = false; FetchedInstr CurInstr; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69b324c..8fbcafd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,7 +1,5 @@ #include "ARMJIT_Compiler.h" -#include "../GPU.h" -#include "../Wifi.h" using namespace Gen; @@ -362,7 +360,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -413,10 +411,11 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) POP(ABI_PARAM4); POP(ABI_PARAM3); + // TODO: optimise this CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -458,25 +457,35 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +void printStuff2(u32 a, u32 b) { + printf("b %x %x\n", a, b); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + FILE* f; + const u8* start = 
GetCodePtr(); + int regsCount = regs.Count(); if (decrement) { - MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); preinc ^= true; } else - MOV(32, R(ABI_PARAM1), rb); + MOV(32, R(ABI_PARAM1), MapReg(rn)); + + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - u32 cycles = Num + u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -484,20 +493,29 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ? MemoryFuncsSeq9[0][preinc] : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + bool firstUserMode = true; for (int reg = 15; reg >= 0; reg--) { if (regs[reg]) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH2), R(RCPSR)); - AND(32, R(RSCRATCH2), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - POP(RSCRATCH); - MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + POP(ABI_PARAM3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + SaveReg(reg, ABI_PARAM3); + SetJumpTarget(sucessfulWritten); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -516,32 +534,48 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei if (regs[15]) { if (Num == 1) - OR(32, MapReg(15), Imm8(1)); + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); } } else { + bool firstUserMode = true; for (int reg : regs) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); - PUSH(RSCRATCH); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, ABI_PARAM3); + else + MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(ABI_PARAM3); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); } else + { PUSH(MapReg(reg).GetSimpleReg()); + } } MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); CALL(Num == 0 ? 
MemoryFuncsSeq9[1][preinc] @@ -550,7 +584,14 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } - return (regsCount * 4) * (decrement ? -1 : 1); + if (usermode && !store) + { + f= fopen("ldm", "a"); + fwrite(start, GetCodePtr() - start, 1, f); + fclose(f); + } + + return offset; } OpArg Compiler::A_Comp_GetMemWBOffset() @@ -697,16 +738,20 @@ void Compiler::A_Comp_LDM_STM() { BitSet16 regs(CurInstr.Instr & 0xFFFF); - bool load = (CurInstr.Instr >> 20) & 1; - bool pre = (CurInstr.Instr >> 24) & 1; - bool add = (CurInstr.Instr >> 23) & 1; - bool writeback = (CurInstr.Instr >> 21) & 1; - bool usermode = (CurInstr.Instr >> 22) & 1; + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); OpArg rn = MapReg(CurInstr.A_Reg(16)); - s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; if (writeback) ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset)); } @@ -789,8 +834,7 @@ void Compiler::T_Comp_PUSH_POP() } OpArg sp = MapReg(13); - - s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max } @@ -801,7 +845,7 @@ void Compiler::T_Comp_LDMIA_STMIA() OpArg rb = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); if (!load || !regs[CurInstr.T_Reg(8)]) ADD(32, rb, Imm8(offset)); -- cgit v1.2.3 From 9b3c14b58abd987d9eb992b04f1f10ee8a6c91f7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 16:42:42 +0200 Subject: jit: SMULL and SMLAL --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 56 ++++++++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index cbe67fd..4afafed 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -290,6 +290,59 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } +void Compiler::A_Comp_SMULL_SMLAL() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn = MapReg(CurInstr.A_Reg(12)); + + if (Num == 0) + Comp_AddCycles_CI(S ? 
3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, 2); + } + + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + if (add) + { + MOV(32, R(RSCRATCH), rd); + SHL(64, R(RSCRATCH), Imm8(32)); + OR(64, R(RSCRATCH), rn); + + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + ADD(64, R(RSCRATCH2), R(RSCRATCH)); + } + else + { + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + if (S) + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + } + + if (S) + Comp_RetriveFlags(false, false, false); + + MOV(32, rn, R(RSCRATCH2)); + SHR(64, R(RSCRATCH2), Imm8(32)); + MOV(32, rd, R(RSCRATCH2)); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -302,9 +355,6 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); - SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8a895d1..b6dd529 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -375,7 +375,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 89dfe28..f9bc227 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -57,6 +57,7 @@ private: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); + void A_Comp_SMULL_SMLAL(); void A_Comp_CLZ(); -- cgit v1.2.3 From 6f0dcad4f66d752f777a28e456967e638a0c8a79 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 17:01:10 +0200 Subject: jit: fix wrongly placed const --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b6dd529..e043f58 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -328,7 +328,7 @@ CompileFunc Compiler::GetCompFunc(int kind) { // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
// see ARMInstrInfo.h for the order - const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = + CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = { // AND A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, @@ -410,7 +410,7 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; - const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { // Shift imm T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, // Three operand ADD/SUB -- cgit v1.2.3 From dcf6e1cad2b38dc4fe0dcbdb789f92e01f802a4a Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 02:37:32 +0200 Subject: jit: fix linux --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 48 +++--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 288 +++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 8 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 15 +- src/dolphin/Log.h | 13 +- src/dolphin/MemoryUtil.cpp | 13 +- 7 files changed, 193 insertions(+), 194 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 4afafed..013f54c 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -154,13 +154,13 @@ void Compiler::A_Comp_Arith() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x2: // SUB - Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOp(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x3: // RSB if (op2.IsZero()) @@ -172,25 +172,25 @@ void Compiler::A_Comp_Arith() Comp_RetriveFlags(true, true, false); } else - Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOpReverse(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x4: // ADD - Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); break; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC - Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); break; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); break; default: assert("unimplemented"); @@ -392,11 +392,11 @@ OpArg 
Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b { void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; if (op == 0) - shiftOp = SHL; + shiftOp = &Compiler::SHL; else if (op == 1) - shiftOp = SHR; + shiftOp = &Compiler::SHR; else if (op == 2) - shiftOp = SAR; + shiftOp = &Compiler::SAR; CMP(32, R(ECX), Imm8(32)); FixupBranch lt32 = J_CC(CC_L); @@ -539,9 +539,9 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); if (op & 1) - Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else - Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); } void Compiler::T_Comp_ALU_Imm8() @@ -564,10 +564,10 @@ void Compiler::T_Comp_ALU_Imm8() Comp_CmpOp(2, rd, imm, false); return; case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); return; case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); return; } } @@ -594,10 +594,10 @@ void Compiler::T_Comp_ALU() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x2: case 0x3: @@ -613,10 +613,10 @@ void Compiler::T_Comp_ALU() } return; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); return; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); return; case 0x8: // TST Comp_CmpOp(0, rd, rs, false); @@ -634,10 +634,10 @@ void Compiler::T_Comp_ALU() Comp_CmpOp(3, rd, rs, false); return; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::OR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); return; case 0xF: // MVN if (rd != rs) @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index bd01ffb..05c8ec6 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -118,7 +118,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (setupRegion) { - MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), Imm32(newPC)); 
CALL((void*)&ARMv5::SetupCodeMem); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index e043f58..2b7ccd2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,6 +4,12 @@ #include +#ifdef _WIN32 +#else +#include +#include +#endif + using namespace Gen; namespace ARMJIT @@ -28,9 +34,34 @@ const int RegisterCache::NativeRegsAvailable = int instructionPopularityARM[ARMInstrInfo::ak_Count]; +/* + We'll repurpose this .bss memory + + */ +u8 CodeMemory[1024 * 1024 * 32]; + Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 16); +#ifdef _WIN32 +#else + u64 pagesize = sysconf(_SC_PAGE_SIZE); +#endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; + +#ifdef _WIN32 +#else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); +#endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + + ClearCodeSpace(); + + SetCodePtr(pageAligned); memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); @@ -187,6 +218,124 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) } } +#define F(x) &Compiler::x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // EOR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SUB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADD + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SBC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), 
F(A_Comp_Arith), F(A_Comp_Arith), + // RSC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ORR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MOV + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // BIC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MVN + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // TST + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // TEQ + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMP + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMN + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // Mul + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + F(A_Comp_CLZ), NULL, NULL, NULL, NULL, + // STR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSB + F(A_Comp_MemHalf), 
F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // swap + NULL, NULL, + // LDM/STM + F(A_Comp_LDM_STM), F(A_Comp_LDM_STM), + // Branch + F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + // Shift imm + F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), + // Three operand ADD/SUB + F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), + // 8 bit imm + F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), + // general ALU + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU), + // hi reg + F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), + // pc/sp relative + F(T_Comp_RelAddr), F(T_Comp_RelAddr), F(T_Comp_AddSP), + // LDR pcrel + F(T_Comp_LoadPCRel), + // LDR/STR reg offset + F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), + // LDR/STR sign extended, half + F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), + // LDR/STR imm offset + F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), + // LDR/STR half imm offset + F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf), + // LDR/STR sp rel + F(T_Comp_MemSPRel), F(T_Comp_MemSPRel), + // PUSH/POP + F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), + // LDMIA, STMIA + F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), + // Branch + F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), + // Unk, SVC + NULL, NULL +}; +#undef F + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -206,7 +355,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -220,8 +369,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; - CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); - + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + if (!Thumb) instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; @@ -318,139 +469,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); RET(); return res; } -CompileFunc Compiler::GetCompFunc(int kind) -{ - // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
- // see ARMInstrInfo.h for the order - CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = - { - // AND - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // EOR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SUB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADD - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SBC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ORR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MOV - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // BIC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MVN - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // TST - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // TEQ - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // 
CMP - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // CMN - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, - // ARMv5 stuff - A_Comp_CLZ, NULL, NULL, NULL, NULL, - // STR - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // STRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDR - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // STRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // LDRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSB - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // swap - NULL, NULL, - // LDM/STM - A_Comp_LDM_STM, A_Comp_LDM_STM, - // Branch - A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, - // system stuff - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - - CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { - // Shift imm - T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, - // Three operand ADD/SUB - T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, - // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, - // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, - // hi reg - T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, - // pc/sp relative - T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, - // LDR pcrel - T_Comp_LoadPCRel, - // LDR/STR reg offset - T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, - // LDR/STR sign extended, half - T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, - // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, - // LDR/STR half imm offset - T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // LDR/STR sp rel - T_Comp_MemSPRel, T_Comp_MemSPRel, - // PUSH/POP - T_Comp_PUSH_POP, T_Comp_PUSH_POP, - // LDMIA, STMIA - T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA, - // Branch - T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2, - // Unk, SVC - NULL, NULL - }; - - return Thumb ? T_Comp[kind] : A_Comp[kind]; -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f9bc227..e04f96a 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,8 +6,6 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegisterCache.h" -#include - namespace ARMJIT { @@ -18,9 +16,6 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler; - -typedef void (Compiler::*CompileFunc)(); class Compiler : public Gen::X64CodeBlock { @@ -32,8 +27,7 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); -private: - CompileFunc GetCompFunc(int kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 8fbcafd..15a40f8 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -464,9 +464,6 @@ void printStuff2(u32 a, u32 b) s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - FILE* f; - const u8* start = GetCodePtr(); - int regsCount = regs.Count(); if (decrement) @@ -482,11 +479,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -581,14 +579,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); - } - - if (usermode && !store) - { - f= fopen("ldm", "a"); - fwrite(start, GetCodePtr() - start, 1, f); - fclose(f); + ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } return offset; diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h index 21e69a5..a7f4b6a 100644 --- a/src/dolphin/Log.h +++ b/src/dolphin/Log.h @@ -4,12 +4,13 @@ #include -#define PanicAlert(msg) \ - do \ - { \ - printf("%s\n", msg); \ - Crash(); \ - } while (false) +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + #define DYNA_REC 0 diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp index 01cb897..7273a8a 100644 --- a/src/dolphin/MemoryUtil.cpp +++ b/src/dolphin/MemoryUtil.cpp @@ -6,15 +6,9 @@ #include #include -#define PanicAlert(fmt, ...) 
\ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - #include "../types.h" #include "CommonFuncs.h" +#include "Log.h" #ifdef _WIN32 #include @@ -39,8 +33,6 @@ namespace Common void* AllocateExecutableMemory(size_t size) { - printf("c\n"); - #if defined(_WIN32) void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); #else @@ -50,13 +42,10 @@ void* AllocateExecutableMemory(size_t size) if (ptr == MAP_FAILED) ptr = nullptr; #endif - printf("a\n"); if (ptr == nullptr) PanicAlert("Failed to allocate executable memory"); - printf("b\n"); - return ptr; } -- cgit v1.2.3 From 9d76d63af5d496e232018d6ddf8ee1e55ad440ad Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 04:33:36 +0200 Subject: jit: make everything configurable --- src/ARM.cpp | 127 ++++++++++++++++++++++++++++++++----- src/ARM.h | 3 + src/ARMJIT.cpp | 21 ++++-- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 14 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/Config.cpp | 6 ++ src/Config.h | 3 + src/NDS.cpp | 26 +++++++- src/libui_sdl/DlgEmuSettings.cpp | 16 +++++ src/libui_sdl/PlatformConfig.cpp | 1 + src/libui_sdl/main.cpp | 17 ++--- 12 files changed, 192 insertions(+), 46 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index a77fbc4..6cc80c0 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -489,7 +489,7 @@ void ARMv5::Execute() while (NDS::ARM9Timestamp < NDS::ARM9Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -522,14 +522,8 @@ void ARMv5::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); - + } + // TODO optimize this shit!!! if (Halted) { @@ -554,6 +548,58 @@ void ARMv5::Execute() Halted = 0; } +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(0, instrAddr)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + if (Halted) + { + if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + if (IRQ) TriggerIRQ(); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} + void ARMv4::Execute() { if (Halted) @@ -577,7 +623,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -605,13 +651,7 @@ void ARMv4::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + } // TODO optimize this shit!!! 
if (Halted) @@ -636,3 +676,56 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; } + +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(1, instrAddr)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + // TODO optimize this shit!!! + if (Halted) + { + if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + + if (IRQ) TriggerIRQ(); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index b9f5d89..0544301 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,6 +52,7 @@ public: } virtual void Execute() = 0; + virtual void ExecuteJIT() = 0; bool CheckCondition(u32 code) { @@ -151,6 +152,7 @@ public: void DataAbort(); void Execute(); + void ExecuteJIT(); // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -269,6 +271,7 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); + void ExecuteJIT(); u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 47b425f..e8e6be0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -125,18 +127,21 @@ CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; - FetchedInstr instrs[12]; + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 r15Initial = cpu->R[15]; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; - //printf("block %x %d\n", r15, thumb); do { r15 += thumb ? 2 : 4; instrs[i].Instr = nextInstr[0]; - //printf("%x %x\n", instrs[i].Instr, r15); instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; if (cpu->Num == 0) @@ -166,16 +171,16 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < 10); + } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, r15Initial - (thumb ? 
2 : 4), block); + InsertBlock(cpu->Num, blockAddr, block); return block; } -void ResetBlocks() +void InvalidateBlockCache() { memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); @@ -185,6 +190,8 @@ void ResetBlocks() memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + + compiler->Reset(); } } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..004256c 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -111,7 +111,7 @@ void DeInit(); CompiledBlock CompileBlock(ARM* cpu); -void ResetBlocks(); +void InvalidateBlockCache(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 2b7ccd2..fe23859 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -336,13 +336,15 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +void Compiler::Reset() +{ + SetCodePtr((u8*)ResetStart); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) - { - ResetBlocks(); - SetCodePtr((u8*)ResetStart); - } + InvalidateBlockCache(); CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -355,7 +357,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -469,7 +471,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); return res; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e04f96a..cd58012 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,6 +22,8 @@ class Compiler : public Gen::X64CodeBlock public: Compiler(); + void Reset(); + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); diff --git a/src/Config.cpp b/src/Config.cpp index f558ef6..37b701c 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -34,6 +34,9 @@ int Threaded3D; int GL_ScaleFactor; int GL_Antialias; +bool JIT_Enable = false; +int JIT_MaxBlockSize = 12; + ConfigEntry ConfigFile[] = { {"3DRenderer", 0, &_3DRenderer, 1, NULL, 0}, @@ -42,6 +45,9 @@ ConfigEntry ConfigFile[] = {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, NULL, 0}, {"GL_Antialias", 0, &GL_Antialias, 0, NULL, 0}, + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 84fd57b..18a7910 100644 --- a/src/Config.h +++ b/src/Config.h @@ -46,6 +46,9 @@ extern int Threaded3D; extern int GL_ScaleFactor; extern int GL_Antialias; +extern bool JIT_Enable; +extern int JIT_MaxBlockSize; + } #endif // CONFIG_H diff --git a/src/NDS.cpp b/src/NDS.cpp index baa5e0d..4b50d9c 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -524,7 +524,7 @@ void Reset() KeyCnt = 0; RCnt = 0; - ARMJIT::ResetBlocks(); + ARMJIT::InvalidateBlockCache(); NDSCart::Reset(); 
GBACart::Reset(); @@ -741,6 +741,11 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } + if (!file->Saving) + { + ARMJIT::InvalidateBlockCache(); + } + return true; } @@ -826,6 +831,7 @@ void RunSystem(u64 timestamp) } } +template u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -858,7 +864,10 @@ u32 RunFrame() } else { - ARM9->Execute(); + if (EnableJIT) + ARM9->ExecuteJIT(); + else + ARM9->Execute(); } RunTimers(0); @@ -880,7 +889,10 @@ u32 RunFrame() } else { - ARM7->Execute(); + if (EnableJIT) + ARM7->ExecuteJIT(); + else + ARM7->Execute(); } RunTimers(1); @@ -910,6 +922,14 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ + if (Config::JIT_Enable) + return RunFrame(); + else + return RunFrame(); +} + void Reschedule(u64 target) { if (CurCPU == 0) diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 0ccaed7..116d2da 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -57,10 +57,20 @@ void OnOk(uiButton* btn, void* blarg) { Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); + Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled); + long blockSize = strtol(uiEntryText(enJITMaxBlockSize), NULL, 10); + if (blockSize < 1) + blockSize = 1; + if (blockSize > 32) + blockSize = 32; + Config::JIT_MaxBlockSize = blockSize; + Config::Save(); uiControlDestroy(uiControl(win)); opened = false; + + ApplyNewSettings(4); } void OnJITStateChanged(uiCheckbox* cb, void* blarg) @@ -143,6 +153,12 @@ void Open() uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); + uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); + { + char maxBlockSizeStr[10]; + sprintf(maxBlockSizeStr, "%d", Config::JIT_MaxBlockSize); + uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); + } OnJITStateChanged(cbJITEnabled, NULL); uiControlShow(uiControl(win)); diff --git a/src/libui_sdl/PlatformConfig.cpp b/src/libui_sdl/PlatformConfig.cpp index f78b195..b6d1e8d 100644 --- a/src/libui_sdl/PlatformConfig.cpp +++ b/src/libui_sdl/PlatformConfig.cpp @@ -64,6 +64,7 @@ char MicWavPath[512]; char LastROMFolder[512]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 8e8bf9e..d6809c3 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -48,6 +48,7 @@ #include "../Wifi.h" #include "../Platform.h" #include "../Config.h" +#include "../ARMJIT.h" #include "../Savestate.h" @@ -2408,19 +2409,11 @@ void ApplyNewSettings(int type) GPU3D::InitRenderer(Screen_UseGL); if (Screen_UseGL) uiGLMakeContextCurrent(NULL); } - /*else if (type == 4) // vsync + else if (type == 4) { - if (Screen_UseGL) - { - uiGLMakeContextCurrent(GLContext); - uiGLSetVSync(Config::ScreenVSync); - uiGLMakeContextCurrent(NULL); - } - else - { - // TODO eventually: VSync for non-GL screen? 
- }
- }*/
+ else if (type == 4)
+ {
+ if (Config::JIT_Enable)
+ ARMJIT::InvalidateBlockCache();
+ }
 
 EmuRunning = prevstatus;
 }
--
cgit v1.2.3


From 360317be8c3744c49f081dded95499f671641809 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 14 Jul 2019 18:08:42 +0200
Subject: jit: remove unnecessary files from dolphin

---
 src/dolphin/CodeBlock.h | 29 +-------
 src/dolphin/MemoryUtil.cpp | 182 ---------------------------------------------
 src/dolphin/MemoryUtil.h | 22 ------
 3 files changed, 1 insertion(+), 232 deletions(-)
 delete mode 100644 src/dolphin/MemoryUtil.cpp
 delete mode 100644 src/dolphin/MemoryUtil.h

(limited to 'src')

diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h
index 1434297..31a8d93 100644
--- a/src/dolphin/CodeBlock.h
+++ b/src/dolphin/CodeBlock.h
@@ -49,15 +49,6 @@ public:
 CodeBlock(CodeBlock&&) = delete;
 CodeBlock& operator=(CodeBlock&&) = delete;
 
- // Call this before you generate any code.
- void AllocCodeSpace(size_t size)
- {
- region_size = size;
- total_region_size = size;
- region = static_cast<u8*>(Common::AllocateExecutableMemory(total_region_size));
- T::SetCodePtr(region);
- }
-
 // Always clear code space with breakpoints, so that if someone accidentally executes
 // uninitialized, it just breaks into the debugger.
 void ClearCodeSpace()
@@ -66,26 +57,8 @@ public:
 ResetCodePtr();
 }
- // Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
- void FreeCodeSpace()
- {
- ASSERT(!m_is_child);
- Common::FreeMemoryPages(region, total_region_size);
- region = nullptr;
- region_size = 0;
- total_region_size = 0;
- for (CodeBlock* child : m_children)
- {
- child->region = nullptr;
- child->region_size = 0;
- child->total_region_size = 0;
- }
- }
-
 bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); }
-
- // Cannot currently be undone. Will write protect the entire code region.
- // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
- void WriteProtect() { Common::WriteProtectMemory(region, region_size, true); }
+
+ void ResetCodePtr() { T::SetCodePtr(region); }
 
 size_t GetSpaceLeft() const
 {
diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp
deleted file mode 100644
index 7273a8a..0000000
--- a/src/dolphin/MemoryUtil.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright 2008 Dolphin Emulator Project
-// Licensed under GPLv2+
-// Refer to the license_dolphin.txt file included.
-
-#include
-#include
-#include
-
-#include "../types.h"
-#include "CommonFuncs.h"
-#include "Log.h"
-
-#ifdef _WIN32
-#include
-//#include "Common/StringUtil.h"
-#else
-#include
-#include
-#include
-#if defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
-#include
-#elif defined __HAIKU__
-#include
-#else
-#include
-#endif
-#endif
-
-namespace Common
-{
-// This is purposely not a full wrapper for virtualalloc/mmap, but it
-// provides exactly the primitive operations that Dolphin needs. 
- -void* AllocateExecutableMemory(size_t size) -{ -#if defined(_WIN32) - void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); -#else - void* ptr = - mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0); - - if (ptr == MAP_FAILED) - ptr = nullptr; -#endif - - if (ptr == nullptr) - PanicAlert("Failed to allocate executable memory"); - - return ptr; -} - -void* AllocateMemoryPages(size_t size) -{ -#ifdef _WIN32 - void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_READWRITE); -#else - void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); - - if (ptr == MAP_FAILED) - ptr = nullptr; -#endif - - if (ptr == nullptr) - PanicAlert("Failed to allocate raw memory"); - - return ptr; -} - -void* AllocateAlignedMemory(size_t size, size_t alignment) -{ -#ifdef _WIN32 - void* ptr = _aligned_malloc(size, alignment); -#else - void* ptr = nullptr; - if (posix_memalign(&ptr, alignment, size) != 0) - ERROR_LOG(MEMMAP, "Failed to allocate aligned memory"); -#endif - - if (ptr == nullptr) - PanicAlert("Failed to allocate aligned memory"); - - return ptr; -} - -void FreeMemoryPages(void* ptr, size_t size) -{ - if (ptr) - { -#ifdef _WIN32 - if (!VirtualFree(ptr, 0, MEM_RELEASE)) - PanicAlert("FreeMemoryPages failed!\nVirtualFree: %s", GetLastErrorString().c_str()); -#else - if (munmap(ptr, size) != 0) - PanicAlert("FreeMemoryPages failed!\nmunmap: %s", LastStrerrorString().c_str()); -#endif - } -} - -void FreeAlignedMemory(void* ptr) -{ - if (ptr) - { -#ifdef _WIN32 - _aligned_free(ptr); -#else - free(ptr); -#endif - } -} - -void ReadProtectMemory(void* ptr, size_t size) -{ -#ifdef _WIN32 - DWORD oldValue; - if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue)) - PanicAlert("ReadProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str()); -#else - if (mprotect(ptr, size, PROT_NONE) != 0) - PanicAlert("ReadProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str()); -#endif -} - -void WriteProtectMemory(void* ptr, size_t size, bool allowExecute) -{ -#ifdef _WIN32 - DWORD oldValue; - if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, &oldValue)) - PanicAlert("WriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str()); -#else - if (mprotect(ptr, size, allowExecute ? (PROT_READ | PROT_EXEC) : PROT_READ) != 0) - PanicAlert("WriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str()); -#endif -} - -void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute) -{ -#ifdef _WIN32 - DWORD oldValue; - if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldValue)) - PanicAlert("UnWriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str()); -#else - if (mprotect(ptr, size, - allowExecute ? 
(PROT_READ | PROT_WRITE | PROT_EXEC) : PROT_WRITE | PROT_READ) != 0) - { - PanicAlert("UnWriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str()); - } -#endif -} - -size_t MemPhysical() -{ -#ifdef _WIN32 - MEMORYSTATUSEX memInfo; - memInfo.dwLength = sizeof(MEMORYSTATUSEX); - GlobalMemoryStatusEx(&memInfo); - return memInfo.ullTotalPhys; -#elif defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__ - int mib[2]; - size_t physical_memory; - mib[0] = CTL_HW; -#ifdef __APPLE__ - mib[1] = HW_MEMSIZE; -#elif defined __FreeBSD__ - mib[1] = HW_REALMEM; -#elif defined __OpenBSD__ - mib[1] = HW_PHYSMEM; -#endif - size_t length = sizeof(size_t); - sysctl(mib, 2, &physical_memory, &length, NULL, 0); - return physical_memory; -#elif defined __HAIKU__ - system_info sysinfo; - get_system_info(&sysinfo); - return static_cast(sysinfo.max_pages * B_PAGE_SIZE); -#else - struct sysinfo memInfo; - sysinfo(&memInfo); - return (size_t)memInfo.totalram * memInfo.mem_unit; -#endif -} - -} // namespace Common diff --git a/src/dolphin/MemoryUtil.h b/src/dolphin/MemoryUtil.h deleted file mode 100644 index 607b7a8..0000000 --- a/src/dolphin/MemoryUtil.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2008 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include -#include - -namespace Common -{ -void* AllocateExecutableMemory(size_t size); -void* AllocateMemoryPages(size_t size); -void FreeMemoryPages(void* ptr, size_t size); -void* AllocateAlignedMemory(size_t size, size_t alignment); -void FreeAlignedMemory(void* ptr); -void ReadProtectMemory(void* ptr, size_t size); -void WriteProtectMemory(void* ptr, size_t size, bool executable = false); -void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false); -size_t MemPhysical(); - -} // namespace Common -- cgit v1.2.3 From 411fb57c07c732a2b60e3566ae045f8f60eea29d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 19:24:00 +0200 Subject: jit: add compile option --- CMakeLists.txt | 30 +++++++++++++++++++ src/ARM.cpp | 13 ++++---- src/ARM.h | 6 ++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 - src/CMakeLists.txt | 25 +++++++++------- src/CP15.cpp | 12 ++++++-- src/Config.cpp | 4 +++ src/Config.h | 2 ++ src/NDS.cpp | 26 ++++++++++++++++ src/dolphin/CodeBlock.h | 3 -- src/libui_sdl/DlgEmuSettings.cpp | 21 +++++++++++-- src/libui_sdl/main.cpp | 2 ++ 13 files changed, 151 insertions(+), 55 deletions(-) (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index 048dd44..d59e19c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,36 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64) + option(ENABLE_JIT "Enable x64 JIT recompiler" 
ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Release) option(ENABLE_LTO "Enable link-time optimization" ON) else() diff --git a/src/ARM.cpp b/src/ARM.cpp index 6cc80c0..eb58d02 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -80,15 +80,8 @@ ARMv4::ARMv4() : ARM(1) // } -namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} - void ARM::Reset() { - FILE* blabla = fopen("fhhg", "w"); - for (int i = 0; i < ARMInstrInfo::ak_Count; i++) - fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); - fclose(blabla); - Cycles = 0; Halted = 0; @@ -548,6 +541,7 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv5::ExecuteJIT() { if (Halted) @@ -599,6 +593,7 @@ void ARMv5::ExecuteJIT() if (Halted == 2) Halted = 0; } +#endif void ARMv4::Execute() { @@ -677,6 +672,7 @@ void ARMv4::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv4::ExecuteJIT() { if (Halted) @@ -728,4 +724,5 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; -} \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index 0544301..ecdf5b4 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,7 +52,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -152,7 +154,9 @@ public: void DataAbort(); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -271,7 +275,9 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fe23859..18cb27e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,7 +4,10 @@ #include +#include "../dolphin/CommonFuncs.h" + #ifdef _WIN32 +#include #else #include #include @@ -32,8 +35,6 @@ const int RegisterCache::NativeRegsAvailable = #endif ; -int instructionPopularityARM[ARMInstrInfo::ak_Count]; - /* We'll repurpose this .bss memory @@ -42,29 +43,33 @@ u8 CodeMemory[1024 * 1024 * 32]; Compiler::Compiler() { -#ifdef _WIN32 -#else - u64 pagesize = sysconf(_SC_PAGE_SIZE); -#endif - - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; - -#ifdef _WIN32 -#else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); -#endif - - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + } ClearCodeSpace(); - SetCodePtr(pageAligned); - - memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); - for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -118,7 
+123,7 @@ Compiler::Compiler() SetJumpTarget(und); MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); RET(); - } + } { // RSCRATCH mode // ABI_PARAM2 reg n @@ -163,7 +168,10 @@ Compiler::Compiler() RET(); } - ResetStart = (void*)GetWritableCodePtr(); + // move the region forward to prevent overwriting the generated functions + region_size -= GetWritableCodePtr() - region; + total_region_size = region_size; + region = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - SetCodePtr((u8*)ResetStart); + ClearCodeSpace(); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) @@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9401220..10428aa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,19 +30,22 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index f232bec..e6e91c3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -812,7 +812,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -834,7 +836,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -856,8 +860,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -879,8 +885,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + 
ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 37b701c..3cff0ed 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -34,8 +34,10 @@ int Threaded3D; int GL_ScaleFactor; int GL_Antialias; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -45,8 +47,10 @@ ConfigEntry ConfigFile[] = {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, NULL, 0}, {"GL_Antialias", 0, &GL_Antialias, 0, NULL, 0}, +#ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 18a7910..c13eae3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -46,8 +46,10 @@ extern int Threaded3D; extern int GL_ScaleFactor; extern int GL_Antialias; +#ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +#endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 4b50d9c..62a52aa 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -162,7 +162,9 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED ARMJIT::Init(); +#endif DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); @@ -194,7 +196,9 @@ void DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED ARMJIT::DeInit(); +#endif for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -524,7 +528,9 @@ void Reset() KeyCnt = 0; RCnt = 0; +#ifdef JIT_ENABLED ARMJIT::InvalidateBlockCache(); +#endif NDSCart::Reset(); GBACart::Reset(); @@ -741,10 +747,12 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED if (!file->Saving) { ARMJIT::InvalidateBlockCache(); } +#endif return true; } @@ -864,9 +872,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM9->ExecuteJIT(); else +#endif ARM9->Execute(); } @@ -889,9 +899,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM7->ExecuteJIT(); else +#endif ARM7->Execute(); } @@ -924,9 +936,11 @@ u32 RunFrame() u32 RunFrame() { +#ifdef JIT_ENABLED if (Config::JIT_Enable) return RunFrame(); else +#endif return RunFrame(); } @@ -1849,7 +1863,9 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -1901,7 +1917,9 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -1969,7 +1987,9 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2264,7 +2284,9 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2325,7 +2347,9 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2396,7 +2420,9 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(1, addr); +#endif switch (addr & 0xFF800000) { diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h index 31a8d93..e71cf6d 100644 --- a/src/dolphin/CodeBlock.h +++ b/src/dolphin/CodeBlock.h @@ -9,7 +9,6 @@ #include "Assert.h" #include "../types.h" -#include "MemoryUtil.h" namespace Common { @@ -41,8 +40,6 @@ public: 
CodeBlock() = default; virtual ~CodeBlock() { - if (region) - FreeCodeSpace(); } CodeBlock(const CodeBlock&) = delete; CodeBlock& operator=(const CodeBlock&) = delete; diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 116d2da..46f5f9f 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -38,8 +38,10 @@ uiWindow* win; uiCheckbox* cbDirectBoot; +#ifdef JIT_ENABLED uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; +#endif int OnCloseWindow(uiWindow* window, void* blarg) { @@ -57,13 +59,17 @@ void OnOk(uiButton* btn, void* blarg) { Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); +#ifdef JIT_ENABLED Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled); - long blockSize = strtol(uiEntryText(enJITMaxBlockSize), NULL, 10); + char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); + long blockSize = strtol(maxBlockSizeStr, NULL, 10); + uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; Config::JIT_MaxBlockSize = blockSize; +#endif Config::Save(); @@ -73,6 +79,7 @@ void OnOk(uiButton* btn, void* blarg) ApplyNewSettings(4); } +#ifdef JIT_ENABLED void OnJITStateChanged(uiCheckbox* cb, void* blarg) { if (uiCheckboxChecked(cb)) @@ -80,6 +87,7 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) else uiControlDisable(uiControl(enJITMaxBlockSize)); } +#endif void Open() { @@ -90,7 +98,7 @@ void Open() } opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 170, 0, 0, 0); + win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0); uiWindowSetMargined(win, 1); uiWindowOnClosing(win, OnCloseWindow, NULL); @@ -105,6 +113,7 @@ void Open() uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); } +#ifdef JIT_ENABLED { uiLabel* dummy = uiNewLabel(""); uiBoxAppend(top, uiControl(dummy), 0); @@ -133,6 +142,12 @@ void Open() uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); } } +#endif + + { + uiLabel* dummy = uiNewLabel(""); + uiBoxAppend(top, uiControl(dummy), 0); + } { uiBox* in_ctrl = uiNewHorizontalBox(); @@ -153,6 +168,7 @@ void Open() uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); +#ifdef JIT_ENABLED uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); { char maxBlockSizeStr[10]; @@ -160,6 +176,7 @@ void Open() uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); } OnJITStateChanged(cbJITEnabled, NULL); +#endif uiControlShow(uiControl(win)); } diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index d6809c3..af05d7a 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2411,8 +2411,10 @@ void ApplyNewSettings(int type) } else if (type == 4) { +#ifdef JIT_ENABLED if (Config::JIT_Enable) ARMJIT::InvalidateBlockCache(); +#endif } EmuRunning = prevstatus; -- cgit v1.2.3 From 8ddc4d5904bafa72a6822bb2f487c9d7f100eb16 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jul 2019 19:17:10 +0200 Subject: jit: fix BLX_reg with rn=lr --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 3 ++- src/ARM_InstrInfo.cpp | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 05c8ec6..1f95a90 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -187,9 +187,10 @@ void Compiler::A_Comp_BranchImm() void Compiler::A_Comp_BranchXchangeReg() { OpArg rn = MapReg(CurInstr.A_Reg(0)); + MOV(32, R(RSCRATCH), rn); if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg MOV(32, MapReg(14), Imm32(R15 - 4)); - Comp_JumpTo(rn.GetSimpleReg()); + 
Comp_JumpTo(RSCRATCH); } void Compiler::T_Comp_BCOND() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b8dff00..c36d6c1 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -359,10 +359,7 @@ Info Decode(bool thumb, u32 num, u32 instr) } if (data & A_Link) - { res.DstRegs |= 1 << 14; - res.SrcRegs |= 1 << 15; - } if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right -- cgit v1.2.3 From 54985be1573710ae39f3c485141b8cbfd3bdf64c Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jul 2019 20:34:08 +0200 Subject: jit: LDM/STM keep proper stack alignment --- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 15a40f8..ee0a7af 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -480,11 +480,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + // we need to make sure that the stack stays aligned to 16 bytes + u32 stackAlloc = ((regsCount + 1) & ~1) * 8; + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -508,7 +511,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc POP(ABI_PARAM3); CALL(WriteBanked); FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); SaveReg(reg, ABI_PARAM3); SetJumpTarget(sucessfulWritten); @@ -529,6 +532,9 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + if (regsCount & 1) + POP(RSCRATCH); + if (regs[15]) { if (Num == 1) @@ -543,6 +549,9 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + if (regsCount & 1) + PUSH(RSCRATCH); + bool firstUserMode = true; for (int reg : regs) { @@ -572,6 +581,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PUSH(MapReg(reg).GetSimpleReg()); } } + MOV(64, R(ABI_PARAM2), R(RSP)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); @@ -579,7 +589,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); } return offset; -- cgit v1.2.3 From be8846e31a80bef098cfa03cef5748d3d8011715 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 17 Jul 2019 03:18:37 +0200 Subject: jit: fix misc static branch things --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 27 +++++++++++++++++++++++---- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 15 ++++++++++----- src/ARM_InstrInfo.cpp | 11 ++++------- 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 1f95a90..6ae4aad 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -35,6 +35,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 newregion = addr >> 24; u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); @@ -53,7 +54,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (addr & 0x2) { nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; - cycles += CurCPU->CodeCycles; + cycles += cpu9->CodeCycles; nextInstr[1] = cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } @@ -61,7 +62,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { nextInstr[0] = cpu9->CodeRead32(addr, true); nextInstr[1] = nextInstr[0] >> 16; - cycles += CurCPU->CodeCycles; + cycles += cpu9->CodeCycles; } } else @@ -74,6 +75,10 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) nextInstr[1] = cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); } else { @@ -86,26 +91,40 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = codeCycles; MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); if (addr & 0x1) { addr &= ~0x1; newPC = addr+2; + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; } else { addr &= ~0x3; newPC = addr+4; + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + nextInstr[0] = cpu7->CodeRead32(addr); nextInstr[1] = cpu7->CodeRead32(addr+4); cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; } MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); @@ -204,7 +223,7 @@ void Compiler::T_Comp_BCOND() FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 18cb27e..1e871fd 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -354,8 +354,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) InvalidateBlockCache(); - CompiledBlock res = 
(CompiledBlock)GetWritableCodePtr(); - ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; @@ -363,6 +361,13 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CodeRegion = cpu->CodeRegion; CurCPU = cpu; + CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); + + if (!IsMapped(Num, R15 - Thumb ? 2 : 4)) + { + printf("Trying to compile a block in unmapped memory\n"); + } + bool mergedThumbBL = false; ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -383,7 +388,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (comp == NULL || i == instrsCount - 1) + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); @@ -454,10 +460,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else (this->*comp)(); - FixupBranch skipFailed; if (CurInstr.Cond() < 0xE) { - skipFailed = J(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index c36d6c1..5db2471 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -178,7 +178,6 @@ enum { T_ReadR13 = 1 << 9, T_WriteR13 = 1 << 10, - T_ReadR15 = 1 << 11, T_BranchAlways = 1 << 12, T_ReadR14 = 1 << 13, @@ -222,7 +221,7 @@ const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); -const u32 T_ADD_PCREL = T_Write8 | T_ReadR15 | tk(tk_ADD_PCREL); +const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); @@ -257,11 +256,11 @@ const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); const u32 T_B = T_BranchAlways | tk(tk_B); -const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); -const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); +const u32 T_BL_LONG_1 = T_WriteR14 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | tk(tk_BL_LONG_2); const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); -const u32 T_SVC = T_BranchAlways | T_WriteR14 | T_ReadR15 | tk(tk_SVC); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); #define INSTRFUNC_PROTO(x) u32 x #include "ARM_InstrTable.h" @@ -299,8 +298,6 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= (1 << 13); if (data & T_WriteR13) res.DstRegs |= (1 << 13); - if (data & T_ReadR15) - res.SrcRegs |= (1 << 15); if (data & T_WriteR14) res.DstRegs |= (1 << 14); if (data & T_ReadR14) -- cgit v1.2.3 From 9d180c7bbc8ccb3459ab2ab14dd2adc7a0f71cf3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 13:36:48 +0200 Subject: jit: decrease blockcache AddrMapping size for ARM9 --- src/ARM.cpp | 8 ++--- src/ARMJIT.cpp | 18 ++++++---- src/ARMJIT.h | 67 ++++++++++++++++++++++++++++---------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++- 
src/NDS.cpp | 12 +++----
 5 files changed, 74 insertions(+), 35 deletions(-)

(limited to 'src')

diff --git a/src/ARM.cpp b/src/ARM.cpp
index eb58d02..b68b5eb 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -566,14 +566,14 @@ void ARMv5::ExecuteJIT()
 while (NDS::ARM9Timestamp < NDS::ARM9Target)
 {
 u32 instrAddr = R[15] - ((CPSR&0x20)?2:4);
- if (!ARMJIT::IsMapped(0, instrAddr))
+ if (!ARMJIT::IsMapped<0>(instrAddr))
 {
 NDS::ARM9Timestamp = NDS::ARM9Target;
 printf("ARMv5 PC in non executable region %08X\n", R[15]);
 return;
 }

- ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr);
+ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr);
 Cycles += (block ? block : ARMJIT::CompileBlock(this))();

 if (Halted)
@@ -697,13 +697,13 @@ void ARMv4::ExecuteJIT()
 while (NDS::ARM7Timestamp < NDS::ARM7Target)
 {
 u32 instrAddr = R[15] - ((CPSR&0x20)?2:4);
- if (!ARMJIT::IsMapped(1, instrAddr))
+ if (!ARMJIT::IsMapped<1>(instrAddr))
 {
 NDS::ARM7Timestamp = NDS::ARM7Target;
 printf("ARMv4 PC in non executable region %08X\n", R[15]);
 return;
 }

- ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr);
+ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr);
 Cycles += (block ? block : ARMJIT::CompileBlock(this))();

 // TODO optimize this shit!!!
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index e8e6be0..aad14c0 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -109,11 +109,14 @@ void Init()
 {
 memset(&cache, 0, sizeof(BlockCache));

- for (int cpu = 0; cpu < 2; cpu++)
- for (int i = 0; i < 0x4000; i++)
- cache.AddrMapping[cpu][i] = JIT_MEM[cpu][i >> 9] == -1 ? NULL :
- (CompiledBlock*)((u8*)&cache + JIT_MEM[cpu][i >> 9])
- + (((i << 14) & JIT_MASK[cpu][i >> 9]) >> 1);
+ for (int i = 0; i < 0x2000; i++)
+ cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL :
+ (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8])
+ + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1);
+ for (int i = 0; i < 0x4000; i++)
+ cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL :
+ (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9])
+ + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1);

 compiler = new Compiler();
 }
@@ -175,7 +178,10 @@ CompiledBlock CompileBlock(ARM* cpu)

 CompiledBlock block = compiler->CompileBlock(cpu, instrs, i);

- InsertBlock(cpu->Num, blockAddr, block);
+ if (cpu->Num == 0)
+ InsertBlock<0>(blockAddr, block);
+ else
+ InsertBlock<1>(blockAddr, block);

 return block;
 }
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 004256c..0fc1c38 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -47,9 +47,11 @@ struct FetchedInstr
 a function which executes a block instructions starting from there.

 The most significant 4 bits of each address is ignored. This 28 bit space is
- divided into 0x4000 16 KB blocks, each of which a pointer to the relevant
- place inside the before mentioned arrays. Only half of the bytes need to be
- addressed (ARM address are aligned to 4, Thumb addresses to a 2 byte boundary).
+ divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which
+ a pointer to the relevant place inside the aforementioned arrays. 32 and 16 KB
+ are the sizes of the smallest contiguous memory region mapped to the respective CPU.
+ Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary,
+ we only need every second half word to be addressable.

 In case a memory write hits mapped memory, the function block at this address
 is set to null, so it's recompiled the next time it's executed.
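To make the mapping just described concrete, here is a minimal editorial sketch of the ARM9 lookup path. It is not part of the patch, and LookUpBlock9 is a hypothetical name: the real code below implements this as the LookUpBlock<0> template, and callers check IsMapped<0> first instead of using the null guard shown here.

// Sketch of the ARM9 block lookup described in the comment above.
// cache is the global ARMJIT::BlockCache instance from this header.
static CompiledBlock LookUpBlock9(u32 addr)
{
    u32 masked = addr & 0xFFFFFFF; // the most significant 4 bits are ignored
    // 0x2000 pages of 32 KB each; each entry points into one of the
    // per-region arrays (MainRAM, SWRAM, ...) or is NULL when unmapped
    CompiledBlock* page = cache.AddrMapping9[masked >> 15];
    if (!page)
        return NULL;
    // ARM/Thumb alignment means only every second halfword needs a slot,
    // hence the offset within the 32 KB page is halved
    return page[(masked & 0x7FFF) >> 1];
}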
@@ -61,7 +63,8 @@ struct FetchedInstr

 struct BlockCache
 {
- CompiledBlock* AddrMapping[2][0x4000] = {0};
+ CompiledBlock* AddrMapping9[0x2000] = {0};
+ CompiledBlock* AddrMapping7[0x4000] = {0};

 CompiledBlock MainRAM[4*1024*1024/2];
 CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
@@ -75,35 +78,63 @@ struct BlockCache

 extern BlockCache cache;

-inline bool IsMapped(u32 num, u32 addr)
+template <u32 num>
+inline bool IsMapped(u32 addr)
 {
- return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14];
+ if (num == 0)
+ return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
+ else
+ return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
 }

-inline CompiledBlock LookUpBlock(u32 num, u32 addr)
+template <u32 num>
+inline CompiledBlock LookUpBlock(u32 addr)
 {
- return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
+ if (num == 0)
+ return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1];
+ else
+ return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
 }

-inline void Invalidate16(u32 num, u32 addr)
+template <u32 num>
+inline void Invalidate16(u32 addr)
 {
- if (IsMapped(num, addr))
- cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
+ if (IsMapped<num>(addr))
+ {
+ if (num == 0)
+ cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL;
+ else
+ cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
+ }
 }

-inline void Invalidate32(u32 num, u32 addr)
+template <u32 num>
+inline void Invalidate32(u32 addr)
 {
- if (IsMapped(num, addr))
+ if (IsMapped<num>(addr))
 {
- CompiledBlock* page = cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14];
- page[(addr & 0x3FFF) >> 1] = NULL;
- page[((addr + 2) & 0x3FFF) >> 1] = NULL;
+ if (num == 0)
+ {
+ CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
+ page[(addr & 0x7FFF) >> 1] = NULL;
+ page[((addr + 2) & 0x7FFF) >> 1] = NULL;
+ }
+ else
+ {
+ CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
+ page[(addr & 0x3FFF) >> 1] = NULL;
+ page[((addr + 2) & 0x3FFF) >> 1] = NULL;
+ }
 }
 }

-inline void InsertBlock(u32 num, u32 addr, CompiledBlock func)
+template <u32 num>
+inline void InsertBlock(u32 addr, CompiledBlock func)
 {
- cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
+ if (num == 0)
+ cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func;
+ else
+ cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
 }

 void Init();
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
index 1e871fd..cb11f73 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
@@ -363,7 +363,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs

 CompiledBlock res = (CompiledBlock)GetWritableCodePtr();

- if (!IsMapped(Num, R15 - Thumb ? 2 : 4))
+ if (!(Num == 0
+ ? IsMapped<0>(R15 - (Thumb ? 2 : 4))
+ : IsMapped<1>(R15 - (Thumb ?
2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } diff --git a/src/NDS.cpp b/src/NDS.cpp index 62a52aa..cab78b5 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1864,7 +1864,7 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(0, addr); + ARMJIT::Invalidate16<0>(addr); #endif switch (addr & 0xFF000000) @@ -1918,7 +1918,7 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(0, addr); + ARMJIT::Invalidate16<0>(addr); #endif switch (addr & 0xFF000000) @@ -1988,7 +1988,7 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32(0, addr); + ARMJIT::Invalidate32<0>(addr); #endif switch (addr & 0xFF000000) @@ -2285,7 +2285,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(1, addr); + ARMJIT::Invalidate16<1>(addr); #endif switch (addr & 0xFF800000) @@ -2348,7 +2348,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(1, addr); + ARMJIT::Invalidate16<1>(addr); #endif switch (addr & 0xFF800000) @@ -2421,7 +2421,7 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32(1, addr); + ARMJIT::Invalidate32<1>(addr); #endif switch (addr & 0xFF800000) -- cgit v1.2.3 From 4a0f6b3b4bd60815d0c8259e4ec2a944bfb716be Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 17:28:16 +0200 Subject: jit: fix thumb hi reg alu and mcr halt + mcr/mrc aren't always, msr_imm is never unk on ARM7 --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 21 ++++++++++++++------- src/ARM_InstrInfo.cpp | 33 ++++++++++++++++++++++++++++----- src/ARM_InstrInfo.h | 1 + 5 files changed, 45 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index aad14c0..6948eee 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -174,7 +174,7 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 013f54c..bdf06f7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); @@ -671,8 +671,6 @@ void Compiler::T_Comp_ALU_HiReg() case 0x2: // MOV if (rdMapped != rs) MOV(32, rdMapped, rs); - TEST(32, rdMapped, rdMapped); - Comp_RetriveFlags(false, false, false); break; } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 6ae4aad..9d4c1e2 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -235,16 +235,23 @@ void Compiler::T_Comp_B() void Compiler::T_Comp_BranchXchangeReg() { bool link = CurInstr.Instr & (1 << 7); - if (link && Num == 1) - { - printf("BLX unsupported on ARM7!!!\n"); - return; - } - OpArg rn = MapReg(CurInstr.A_Reg(3)); if 
(link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(32, R(RSCRATCH), MapReg(CurInstr.A_Reg(3))); MOV(32, MapReg(14), Imm32(R15 - 1)); - Comp_JumpTo(rn.GetSimpleReg()); + Comp_JumpTo(RSCRATCH); + } + else + { + OpArg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn.GetSimpleReg()); + } } void Compiler::T_Comp_BL_LONG_1() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 5db2471..b70c8dc 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -152,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); +const u32 A_MSR_IMM = ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -310,6 +310,7 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= 1 << 15; res.Kind = (data >> 16) & 0x3F; + res.EndBlock = res.Branches(); return res; } @@ -324,6 +325,26 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_MCR) + { + u32 cn = (instr >> 16) & 0xF; + u32 cm = instr & 0xF; + u32 cpinfo = (instr >> 5) & 0x7; + u32 id = (cn<<8)|(cm<<4)|cpinfo; + if (id == 0x704 || id == 0x782) + res.EndBlock |= true; + } + if (res.Kind == ak_MCR || res.Kind == ak_MRC) + { + u32 cp = ((instr >> 8) & 0xF); + if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) + { + printf("happens\n"); + data = A_UNK; + res.Kind = ak_UNK; + } + } + if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); if (data & A_Read16) @@ -361,6 +382,8 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + res.EndBlock |= res.Branches(); + return res; } } diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 51dcfa2..4fe9b10 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -220,6 +220,7 @@ struct Info u16 DstRegs, SrcRegs; u16 Kind; + bool EndBlock; bool Branches() { return DstRegs & (1 << 15); -- cgit v1.2.3 From 51b6b7a7d56e727e164c6ef7cdde3d3e0f4b058e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 23:56:24 +0200 Subject: fix uninitialised memory mapping --- src/ARM.cpp | 1 - src/CP15.cpp | 2 ++ src/NDS.cpp | 9 ++++++++- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index b68b5eb..868c287 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -102,7 +102,6 @@ void ARM::Reset() void ARMv5::Reset() { - CP15Reset(); ARM::Reset(); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..b24c1c1 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -257,9 +257,11 @@ void ARMv5::UpdatePURegions(bool update_all) void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { + printf("initialising region timings %x %x\n", addrstart, addrend); addrstart >>= 12; addrend >>= 12; + if (addrend == 0xFFFFF) addrend++; for (u32 i = addrstart; i < addrend; i++) diff --git a/src/NDS.cpp b/src/NDS.cpp index cab78b5..1baa308 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ 
-247,7 +249,9 @@ void SetARM9RegionTimings(u32 addrstart, u32 addrend, int buswidth, int nonseq,
 ARM9MemTimings[i][3] = S32;
 }

- ARM9->UpdateRegionTimings(addrstart<<14, addrend<<14);
+ ARM9->UpdateRegionTimings(addrstart<<14, addrend == 0x40000
+ ? 0xFFFFFFFF
+ : (addrend<<14));
 }

 void SetARM7RegionTimings(u32 addrstart, u32 addrend, int buswidth, int nonseq, int seq)
@@ -459,6 +461,11 @@ void Reset()
 fclose(f);
 }

+ // has to be called before InitTimings
+ // otherwise some PU settings are completely
+ // uninitialised on the first run
+ ARM9->CP15Reset();
+
 // TODO for later: configure this when emulating a DSi
 ARM9ClockShift = 1;

-- cgit v1.2.3


From 707da1f4c78d21c465ae00696d92bdb324dc0c0e Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 21 Jul 2019 23:59:02 +0200
Subject: remove debug printf

---
 src/CP15.cpp | 2 --
 1 file changed, 2 deletions(-)

(limited to 'src')

diff --git a/src/CP15.cpp b/src/CP15.cpp
index b24c1c1..e6e91c3 100644
--- a/src/CP15.cpp
+++ b/src/CP15.cpp
@@ -257,11 +257,9 @@ void ARMv5::UpdatePURegions(bool update_all)

 void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend)
 {
- printf("initialising region timings %x %x\n", addrstart, addrend);
 addrstart >>= 12;
 addrend >>= 12;
-
 if (addrend == 0xFFFFF) addrend++;

 for (u32 i = addrstart; i < addrend; i++)
-- cgit v1.2.3


From f31976fed0c0c61e403ccaee5154c1f25d24d60d Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Mon, 22 Jul 2019 01:04:42 +0200
Subject: jit: fix RSC

---
 src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp
index bdf06f7..368fd8b 100644
--- a/src/ARMJIT_x64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp
@@ -181,7 +181,7 @@ void Compiler::A_Comp_Arith()
 Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry);
 break;
 case 0x6: // SBC
- Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry);
+ Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opSyncCarry|opInvertCarry);
 break;
 case 0x7: // RSC
 Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry);
-- cgit v1.2.3


From 5e443e79625b66daf15350d68921d74673cb5232 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Fri, 16 Aug 2019 23:17:08 +0200
Subject: remove unneeded dolphin code, C++11 static_assert

---
 src/ARMJIT.cpp | 2 +
 src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +-
 src/ARMJIT_x64/ARMJIT_Compiler.cpp | 19 ++++----
 src/ARMJIT_x64/ARMJIT_Compiler.h | 5 +-
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 2 +-
 src/dolphin/Assert.h | 47 -------------------
 src/dolphin/CodeBlock.h | 91 -------------------------------------
 src/dolphin/Compat.h | 63 +++++++++++++++++++++++++
 src/dolphin/Intrinsics.h | 72 -----------------------------
 src/dolphin/Log.h | 21 ---------
 src/dolphin/x64CPUDetect.cpp | 1 -
 src/dolphin/x64Emitter.cpp | 3 +-
 src/dolphin/x64Emitter.h | 13 +-----
 13 files changed, 84 insertions(+), 259 deletions(-)
 delete mode 100644 src/dolphin/Assert.h
 delete mode 100644 src/dolphin/CodeBlock.h
 create mode 100644 src/dolphin/Compat.h
 delete mode 100644 src/dolphin/Intrinsics.h
 delete mode 100644 src/dolphin/Log.h

(limited to 'src')

diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 6948eee..74554d7 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@
memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 368fd8b..f0bcf8e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -257,7 +257,7 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O Comp_AddCycles_CI(RSCRATCH, add ? 2 : 1); } - static_assert(EAX == RSCRATCH); + static_assert(EAX == RSCRATCH, "Someone changed RSCRATCH!"); MOV(32, R(RSCRATCH), rm); if (add) { @@ -383,7 +383,7 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b } MOV(32, R(RSCRATCH), rm); - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "Someone changed RSCRATCH3"); MOV(32, R(ECX), rs); AND(32, R(ECX), Imm32(0xFF)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index cb11f73..0fbcfda 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -63,12 +63,11 @@ Compiler::Compiler() mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); #endif - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + ResetStart = pageAligned; + CodeMemSize = alignedSize; } - ClearCodeSpace(); + Reset(); for (int i = 0; i < 3; i++) { @@ -169,9 +168,8 @@ Compiler::Compiler() } // move the region forward to prevent overwriting the generated functions - region_size -= GetWritableCodePtr() - region; - total_region_size = region_size; - region = GetWritableCodePtr(); + CodeMemSize -= GetWritableCodePtr() - ResetStart; + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -208,7 +206,7 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) { if (cond >= 0x8) { - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); MOV(32, R(RSCRATCH3), R(RCPSR)); SHR(32, R(RSCRATCH3), Imm8(28)); MOV(32, R(RSCRATCH), Imm32(1)); @@ -346,12 +344,13 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - ClearCodeSpace(); + memset(ResetStart, 0xcc, CodeMemSize); + SetCodePtr(ResetStart); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { - if (IsAlmostFull()) + if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
InvalidateBlockCache(); ConstantCycles = 0; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 0ce7d8d..3151cbc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -17,7 +17,7 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler : public Gen::X64CodeBlock +class Compiler : public Gen::XEmitter { public: Compiler(); @@ -132,6 +132,9 @@ public: return Gen::R(RegCache.Mapping[reg]); } + u8* ResetStart; + u32 CodeMemSize; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index ee0a7af..6386f8b 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -171,7 +171,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } RET(); - static_assert(RSCRATCH == EAX); + static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); return res; } diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h deleted file mode 100644 index 4eb16e0..0000000 --- a/src/dolphin/Assert.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include - -#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ - assert(_a_) \ - /*do \ - { \ - if (!(_a_)) \ - { \ - if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ - assert(_a_); \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ - { \ - ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ - if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - ASSERT_MSG(MASTER_LOG, _a_, \ - _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ - __LINE__, __FILE__); \ - } while (0)*/ - -#define DEBUG_ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ - ASSERT(_a_); \ - } while (0)*/ diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h deleted file mode 100644 index e71cf6d..0000000 --- a/src/dolphin/CodeBlock.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2014 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include -#include - -#include "Assert.h" -#include "../types.h" - -namespace Common -{ -// Everything that needs to generate code should inherit from this. -// You get memory management for free, plus, you can use all emitter functions without -// having to prefix them with gen-> or something similar. -// Example implementation: -// class JIT : public CodeBlock {} -template -class CodeBlock : public T -{ -private: - // A privately used function to set the executable RAM space to something invalid. - // For debugging usefulness it should be used to set the RAM to a host specific breakpoint - // instruction - virtual void PoisonMemory() = 0; - -protected: - u8* region = nullptr; - // Size of region we can use. - size_t region_size = 0; - // Original size of the region we allocated. 
- size_t total_region_size = 0; - - bool m_is_child = false; - std::vector m_children; - -public: - CodeBlock() = default; - virtual ~CodeBlock() - { - } - CodeBlock(const CodeBlock&) = delete; - CodeBlock& operator=(const CodeBlock&) = delete; - CodeBlock(CodeBlock&&) = delete; - CodeBlock& operator=(CodeBlock&&) = delete; - - // Always clear code space with breakpoints, so that if someone accidentally executes - // uninitialized, it just breaks into the debugger. - void ClearCodeSpace() - { - PoisonMemory(); - ResetCodePtr(); - } - - bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } - - void ResetCodePtr() { T::SetCodePtr(region); } - size_t GetSpaceLeft() const - { - ASSERT(static_cast(T::GetCodePtr() - region) < region_size); - return region_size - (T::GetCodePtr() - region); - } - - bool IsAlmostFull() const - { - // This should be bigger than the biggest block ever. - return GetSpaceLeft() < 0x10000; - } - - bool HasChildren() const { return region_size != total_region_size; } - u8* AllocChildCodeSpace(size_t child_size) - { - ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); - u8* child_region = region + region_size - child_size; - region_size -= child_size; - return child_region; - } - void AddChildCodeSpace(CodeBlock* child, size_t child_size) - { - u8* child_region = AllocChildCodeSpace(child_size); - child->m_is_child = true; - child->region = child_region; - child->region_size = child_size; - child->total_region_size = child_size; - child->ResetCodePtr(); - m_children.emplace_back(child); - } -}; -} // namespace Common diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h new file mode 100644 index 0000000..f2f52a5 --- /dev/null +++ b/src/dolphin/Compat.h @@ -0,0 +1,63 @@ +// Stubs for Assert.h and Log.h +#pragma once + +#include + +// Assert stub +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ + +// Log Stub +#include + +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define DYNA_REC 0 + +#define ERROR_LOG(which, fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + } while (false) diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h deleted file mode 100644 index 483f219..0000000 --- a/src/dolphin/Intrinsics.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#if defined(_M_X86) - -/** - * It is assumed that all compilers used to build Dolphin support intrinsics up to and including - * SSE 4.2 on x86/x64. 
- */ - -#if defined(__GNUC__) || defined(__clang__) - -/** - * Due to limitations in GCC, SSE intrinsics are only available when compiling with the - * corresponding instruction set enabled. However, using the target attribute, we can compile - * single functions with a different target instruction set, while still creating a generic build. - * - * Since this instruction set is enabled per-function, any callers should verify that the - * instruction set is supported at runtime before calling it, and provide a fallback implementation - * when not supported. - * - * When building with -march=native, or enabling the instruction sets in the compile flags, permit - * usage of the instrinsics without any function attributes. If the command-line architecture does - * not support this instruction set, enable it via function targeting. - */ - -#include -#ifndef __SSE4_2__ -#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]] -#endif -#ifndef __SSE4_1__ -#define FUNCTION_TARGET_SSR41 [[gnu::target("sse4.1")]] -#endif -#ifndef __SSSE3__ -#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]] -#endif -#ifndef __SSE3__ -#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]] -#endif - -#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) - -/** - * MSVC and ICC support intrinsics for any instruction set without any function attributes. - */ -#include - -#endif // defined(_MSC_VER) || defined(__INTEL_COMPILER) - -#endif // _M_X86 - -/** - * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform. - * This way when a function is defined with FUNCTION_TARGET you don't need to define a second - * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use - * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures. - */ -#ifndef FUNCTION_TARGET_SSE42 -#define FUNCTION_TARGET_SSE42 -#endif -#ifndef FUNCTION_TARGET_SSR41 -#define FUNCTION_TARGET_SSR41 -#endif -#ifndef FUNCTION_TARGET_SSSE3 -#define FUNCTION_TARGET_SSSE3 -#endif -#ifndef FUNCTION_TARGET_SSE3 -#define FUNCTION_TARGET_SSE3 -#endif diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h deleted file mode 100644 index a7f4b6a..0000000 --- a/src/dolphin/Log.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include "CommonFuncs.h" - -#include - -#define PanicAlert(fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - - -#define DYNA_REC 0 - -#define ERROR_LOG(which, fmt, ...) 
\
- do \
- { \
- printf(fmt "\n", ## __VA_ARGS__); \
- } while (false)
diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp
index 05ee11c..49b51c9 100644
--- a/src/dolphin/x64CPUDetect.cpp
+++ b/src/dolphin/x64CPUDetect.cpp
@@ -7,7 +7,6 @@
 #include "CPUDetect.h"

 #include "../types.h"
-#include "Intrinsics.h"

 #ifndef _MSVC_VER

diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp
index 7849624..343f314 100644
--- a/src/dolphin/x64Emitter.cpp
+++ b/src/dolphin/x64Emitter.cpp
@@ -7,9 +7,10 @@

 #include "CPUDetect.h"
 #include "../types.h"
-#include "Log.h"
 #include "x64Emitter.h"
 #include "x64Reg.h"
+#include "Compat.h"
+#include "CommonFuncs.h"

 namespace Gen
 {
diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h
index 122850d..869acb6 100644
--- a/src/dolphin/x64Emitter.h
+++ b/src/dolphin/x64Emitter.h
@@ -12,9 +12,8 @@
 #include
 #include

-#include "Assert.h"
+#include "Compat.h"
 #include "BitSet.h"
-#include "CodeBlock.h"
 #include "../types.h"
 #include "x64ABI.h"
@@ -1167,14 +1166,4 @@ public:
 }
 }; // class XEmitter

-class X64CodeBlock : public Common::CodeBlock<XEmitter>
-{
-private:
- void PoisonMemory() override
- {
- // x86/64: 0xCC = breakpoint
- memset(region, 0xCC, region_size);
- }
-};
-
 } // namespace
-- cgit v1.2.3


From ec21172cd9932805f02d84f41599c7a23e3b23f5 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sat, 17 Aug 2019 14:58:37 +0200
Subject: fix register alloc for half word loads

fixes Mega Man Star Force 2 with a cheat applied; it probably used a
pc relative load which was interpreted as a branch
---
 src/ARM_InstrInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp
index b70c8dc..4813799 100644
--- a/src/ARM_InstrInfo.cpp
+++ b/src/ARM_InstrInfo.cpp
@@ -127,8 +127,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB,LDR)
 #define A_STRD A_Read12Double

 #define A_IMPLEMENT_HD_LDRSTR(x,k) \
- const u32 A_##x##_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_IMM); \
- const u32 A_##x##_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_REG); \
+ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \
+ const u32 A_##x##_REG = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG); \
 const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \
 const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG);
-- cgit v1.2.3


From 3001d9492c6e7e83e82843a4b9c6186b0b58f5e5 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sat, 17 Aug 2019 16:50:48 +0200
Subject: abandon pipelining on jit

fixes Golden Sun Dawn

this makes the cpu state incompatible between interpreter and JIT.
That's why switching the cpu mode requires a restart (not requiring one
would be stupid anyway) and the pipeline is manually filled when making
a save state.
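For context, the invariant being restored is the interpreter's two-stage prefetch: R15 always runs two fetches ahead of the instruction being executed. Below is a minimal sketch of the refill for the ARM7 side; RefillPipeline7 is a hypothetical free-function rendering of the ARMv4::FillPipeline method added in the diff that follows (the ARMv5 version additionally handles Thumb fetches that straddle a word boundary).

// Rebuild NextInstr[0..1] from memory so the interpreter can resume
// from a state the JIT left behind (the JIT never keeps these current).
void RefillPipeline7(ARMv4* cpu)
{
    if (cpu->CPSR & 0x20) // Thumb: halfword opcodes at R15-2 and R15
    {
        cpu->NextInstr[0] = cpu->CodeRead16(cpu->R[15] - 2);
        cpu->NextInstr[1] = cpu->CodeRead16(cpu->R[15]);
    }
    else // ARM: word opcodes at R15-4 and R15
    {
        cpu->NextInstr[0] = cpu->CodeRead32(cpu->R[15] - 4);
        cpu->NextInstr[1] = cpu->CodeRead32(cpu->R[15]);
    }
}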
--- src/ARM.cpp | 46 +++++++++++++++++++++++++++++- src/ARM.h | 6 ++++ src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_Branch.cpp | 39 +++++++++++-------------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 5 ---- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 5 ---- src/libui_sdl/DlgEmuSettings.cpp | 28 ++++++++++++++---- src/libui_sdl/libui/ui.h | 1 + src/libui_sdl/libui/windows/stddialogs.cpp | 17 +++++++++-- src/libui_sdl/main.cpp | 16 +++++------ 10 files changed, 116 insertions(+), 48 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 868c287..e404943 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -22,6 +22,7 @@ #include "ARMInterpreter.h" #include "AREngine.h" #include "ARMJIT.h" +#include "Config.h" // instruction timing notes @@ -122,6 +123,13 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); + if (!file->Saving && Config::JIT_Enable) + { + // hack, the JIT doesn't really pipeline + // but we still want JIT save states to be + // loaded while running the interpreter + FillPipeline(); + } file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); @@ -724,4 +732,40 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; } -#endif \ No newline at end of file +#endif + +void ARMv5::FillPipeline() +{ + if (CPSR & 0x20) + { + if ((R[15] - 2) & 0x2) + { + NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; + NextInstr[1] = CodeRead32(R[15], false); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[1] = NextInstr[0] >> 16; + } + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4, false); + NextInstr[1] = CodeRead32(R[15], false); + } +} + +void ARMv4::FillPipeline() +{ + if (CPSR & 0x20) + { + NextInstr[0] = CodeRead16(R[15] - 2); + NextInstr[1] = CodeRead16(R[15]); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + } +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index ecdf5b4..4d387bc 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -42,6 +42,8 @@ public: virtual void DoSavestate(Savestate* file); + virtual void FillPipeline() = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; void RestoreCPSR(); @@ -148,6 +150,8 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void PrefetchAbort(); @@ -272,6 +276,8 @@ class ARMv4 : public ARM public: ARMv4(); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74554d7..949bc1c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -139,6 +139,7 @@ CompiledBlock CompileBlock(ARM* cpu) int i = 0; u32 blockAddr = cpu->R[15] - (thumb ? 
2 : 4); u32 r15 = cpu->R[15]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; do { diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 9d4c1e2..30b18d7 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -4,6 +4,14 @@ using namespace Gen; namespace ARMJIT { + +template +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { @@ -12,9 +20,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // we'll see how it works out u32 newPC; - u32 nextInstr[2]; u32 cycles = 0; - bool setupRegion = false; if (addr & 0x1 && !Thumb) { @@ -40,7 +46,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - setupRegion = newregion != oldregion; + bool setupRegion = newregion != oldregion; if (setupRegion) cpu9->SetupCodeMem(addr); @@ -53,15 +59,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { - nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; + cpu9->CodeRead32(addr-2, true); cycles += cpu9->CodeCycles; - nextInstr[1] = cpu9->CodeRead32(addr+2, false); + cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } else { - nextInstr[0] = cpu9->CodeRead32(addr, true); - nextInstr[1] = nextInstr[0] >> 16; + cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; } } @@ -70,12 +75,15 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) addr &= ~0x3; newPC = addr+4; - nextInstr[0] = cpu9->CodeRead32(addr, true); + cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; - nextInstr[1] = cpu9->CodeRead32(addr+4, false); + cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } + MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); + cpu9->RegionCodeCycles = compileTimeCodeCycles; if (setupRegion) cpu9->SetupCodeMem(R15); @@ -102,8 +110,6 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 compileTimePC = CurCPU->R[15]; CurCPU->R[15] = newPC; - nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); - nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; CurCPU->R[15] = compileTimePC; @@ -116,8 +122,6 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 compileTimePC = CurCPU->R[15]; CurCPU->R[15] = newPC; - nextInstr[0] = cpu7->CodeRead32(addr); - nextInstr[1] = cpu7->CodeRead32(addr+4); cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; CurCPU->R[15] = compileTimePC; @@ -128,19 +132,10 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) } MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(nextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(nextInstr[1])); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); - - if (setupRegion) - { - MOV(64, R(ABI_PARAM1), R(RCPU)); - MOV(32, R(ABI_PARAM2), Imm32(newPC)); - CALL((void*)&ARMv5::SetupCodeMem); - } } void 
Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 0fbcfda..ab13cb6 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -395,11 +395,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (i == instrsCount - 1) - { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); - } if (comp == NULL) SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 6386f8b..3b4cb7d 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -457,11 +457,6 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -void printStuff2(u32 a, u32 b) -{ - printf("b %x %x\n", a, b); -} - s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { int regsCount = regs.Count(); diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 46f5f9f..2f5ee2d 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -29,6 +29,7 @@ void ApplyNewSettings(int type); +extern bool RunningSomething; namespace DlgEmuSettings { @@ -57,10 +58,10 @@ void OnCancel(uiButton* btn, void* blarg) void OnOk(uiButton* btn, void* blarg) { - Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); - #ifdef JIT_ENABLED - Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled); + bool restart = false; + + bool enableJit = uiCheckboxChecked(cbJITEnabled); char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); uiFreeText(maxBlockSizeStr); @@ -68,15 +69,32 @@ void OnOk(uiButton* btn, void* blarg) blockSize = 1; if (blockSize > 32) blockSize = 32; - Config::JIT_MaxBlockSize = blockSize; + + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize) + { + if (RunningSomething && + !uiMsgBoxConfirm(win, "Reset emulator", + "Changing JIT settings requires a reset.\n\nDo you want to continue?")) + return; + + Config::JIT_Enable = enableJit; + Config::JIT_MaxBlockSize = Config::JIT_MaxBlockSize; + + restart = true; + } #endif + Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); + Config::Save(); uiControlDestroy(uiControl(win)); opened = false; - ApplyNewSettings(4); +#ifdef JIT_ENABLED + if (restart) + ApplyNewSettings(4); +#endif } #ifdef JIT_ENABLED diff --git a/src/libui_sdl/libui/ui.h b/src/libui_sdl/libui/ui.h index 03aef5d..e45fe91 100644 --- a/src/libui_sdl/libui/ui.h +++ b/src/libui_sdl/libui/ui.h @@ -289,6 +289,7 @@ _UI_EXTERN char *uiOpenFile(uiWindow *parent, const char* filter, const char* in _UI_EXTERN char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath); _UI_EXTERN void uiMsgBox(uiWindow *parent, const char *title, const char *description); _UI_EXTERN void uiMsgBoxError(uiWindow *parent, const char *title, const char *description); +_UI_EXTERN int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description); typedef struct uiArea uiArea; typedef struct uiAreaHandler uiAreaHandler; diff --git 
a/src/libui_sdl/libui/windows/stddialogs.cpp b/src/libui_sdl/libui/windows/stddialogs.cpp index d0fd506..7537015 100644 --- a/src/libui_sdl/libui/windows/stddialogs.cpp +++ b/src/libui_sdl/libui/windows/stddialogs.cpp @@ -136,7 +136,7 @@ char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) // TODO switch to TaskDialogIndirect()? -static void msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) +static int msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) { WCHAR *wtitle, *wdescription; HRESULT hr; @@ -144,12 +144,15 @@ static void msgbox(HWND parent, const char *title, const char *description, TASK wtitle = toUTF16(title); wdescription = toUTF16(description); - hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, NULL); + int result; + hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, &result); if (hr != S_OK) logHRESULT(L"error showing task dialog", hr); uiFree(wdescription); uiFree(wtitle); + + return result; } void uiMsgBox(uiWindow *parent, const char *title, const char *description) @@ -165,3 +168,13 @@ void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, TD_ERROR_ICON); enableAllWindowsExcept(parent); } + +int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) +{ + disableAllWindowsExcept(parent); + int result = + msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON | TDCBF_CANCEL_BUTTON, TD_WARNING_ICON); + enableAllWindowsExcept(parent); + + return result == IDOK; +} \ No newline at end of file diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index af05d7a..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2355,6 +2355,14 @@ void OnSetShowOSD(uiMenuItem* item, uiWindow* window, void* blarg) void ApplyNewSettings(int type) { +#ifdef JIT_ENABLED + if (type == 4) + { + Reset(NULL); + return; + } +#endif + if (!RunningSomething) { if (type == 1) return; @@ -2409,14 +2417,6 @@ void ApplyNewSettings(int type) GPU3D::InitRenderer(Screen_UseGL); if (Screen_UseGL) uiGLMakeContextCurrent(NULL); } - else if (type == 4) - { -#ifdef JIT_ENABLED - if (Config::JIT_Enable) - ARMJIT::InvalidateBlockCache(); -#endif - } - EmuRunning = prevstatus; } -- cgit v1.2.3 From 03ab7f1645f5a5c8427bc53a12f417845a17c980 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 17 Aug 2019 16:54:13 +0200 Subject: fix jit block size not changeable --- src/libui_sdl/DlgEmuSettings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 2f5ee2d..09ea8eb 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -78,7 +78,7 @@ void OnOk(uiButton* btn, void* blarg) return; Config::JIT_Enable = enableJit; - Config::JIT_MaxBlockSize = Config::JIT_MaxBlockSize; + Config::JIT_MaxBlockSize = blockSize; restart = true; } -- cgit v1.2.3 From 5ea91b8a039e0735ac5cb102e2375c26c4f7a150 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 12:28:48 +0200 Subject: optimise away unneeded flag sets - especially useful for thumb code and larger max block sizes - can still be improved upon --- src/ARMJIT.cpp | 24 ++++ src/ARMJIT.h | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 64 +++++++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 9 ++ 
src/ARMJIT_x64/ARMJIT_Compiler.h | 6 +- src/ARM_InstrInfo.cpp | 238 +++++++++++++++++++++++-------------- src/ARM_InstrInfo.h | 13 ++ src/libui_sdl/main.cpp | 2 + 8 files changed, 248 insertions(+), 109 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 949bc1c..3b6bc2e 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -126,6 +126,24 @@ void DeInit() delete compiler; } +void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +{ + for (int j = start; j >= 0; j--) + { + u8 match = instrs[j].Info.WriteFlags & flags; + u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags; + if (matchMaybe) // writes flags maybe + instrs[j].SetFlags |= matchMaybe; + if (match) + { + instrs[j].SetFlags |= match; + flags &= ~match; + if (!flags) + return; + } + } +} + CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -175,8 +193,14 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; + + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) + floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + floodFillSetFlags(instrs, i - 1, 0xF); + CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); if (cpu->Num == 0) diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 0fc1c38..6197695 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,6 +28,7 @@ struct FetchedInstr return Instr >> 28; } + u8 SetFlags; u32 Instr; u32 NextInstr[2]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f0bcf8e..6a7d711 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -111,6 +111,8 @@ OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) } else { + S = S && (CurInstr.SetFlags & 0x2); + int op = (CurInstr.Instr >> 5) & 0x3; if (CurInstr.Instr & (1 << 4)) { @@ -215,7 +217,8 @@ void Compiler::A_Comp_MovOp() if (S) { - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -263,12 +266,14 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O { IMUL(32, RSCRATCH, rs); LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); - TEST(32, rd, rd); + if (S && FlagsNZRequired()) + TEST(32, rd, rd); } else { IMUL(32, RSCRATCH, rs); MOV(32, rd, R(RSCRATCH)); + if (S && FlagsNZRequired()) TEST(32, R(RSCRATCH), R(RSCRATCH)); } @@ -331,7 +336,7 @@ void Compiler::A_Comp_SMULL_SMLAL() else { IMUL(64, RSCRATCH2, R(RSCRATCH3)); - if (S) + if (S && FlagsNZRequired()) TEST(64, R(RSCRATCH2), R(RSCRATCH2)); } @@ -345,9 +350,20 @@ void Compiler::A_Comp_SMULL_SMLAL() void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { - CPSRDirty = true; + if (CurInstr.SetFlags == 0) + return; + if (retriveCV && !(CurInstr.SetFlags & 0x3)) + retriveCV = false; bool carryOnly = !retriveCV && carryUsed; + if (carryOnly && !(CurInstr.SetFlags & 0x2)) + { + carryUsed = false; + carryOnly = false; + } + + CPSRDirty = true; + if (retriveCV) { SETcc(CC_O, R(RSCRATCH)); @@ -355,19 +371,28 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - SETcc(CC_S, R(RSCRATCH)); - SETcc(CC_Z, R(RSCRATCH3)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); - int shiftAmount = 30; - if (retriveCV || 
carryUsed) + if (FlagsNZRequired()) { - LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); - shiftAmount = carryOnly ? 29 : 28; - } - SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); - AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); - OR(32, R(RCPSR), R(RSCRATCH)); + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); + AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH2)); + } } // always uses RSCRATCH, RSCRATCH2 only if S == true @@ -523,7 +548,8 @@ void Compiler::T_Comp_ShiftImm() if (shifted != rd) MOV(32, rd, shifted); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -557,7 +583,8 @@ void Compiler::T_Comp_ALU_Imm8() { case 0x0: MOV(32, rd, imm); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, false); return; case 0x1: @@ -607,7 +634,8 @@ void Compiler::T_Comp_ALU() int shiftOp = op == 0x7 ? 3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); - TEST(32, shifted, shifted); + if (FlagsNZRequired()) + TEST(32, shifted, shifted); MOV(32, rd, shifted); Comp_RetriveFlags(false, false, true); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ab13cb6..6abb2bb 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -342,6 +342,11 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); @@ -380,11 +385,15 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); + printf("block start %d\n", Thumb); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); + CompileFunc comp = Thumb ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3151cbc..8861884 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,6 +29,8 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); + bool CanCompile(bool thumb, u16 kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); @@ -64,7 +66,6 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); - void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -121,6 +122,9 @@ public: void LoadCPSR(); void SaveCPSR(); + bool FlagsNZRequired() + { return CurInstr.SetFlags & 0xC; } + Gen::FixupBranch CheckCondition(u32 cond); Gen::OpArg MapReg(int reg) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 4813799..ea6d827 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 13) +#define ak(x) ((x) << 18) enum { A_Read0 = 1 << 0, @@ -26,69 +26,81 @@ enum { A_Link = 1 << 10, A_UnkOnARM7 = 1 << 11, + + A_SetNZ = 1 << 12, + A_SetCV = 1 << 13, + A_SetMaybeC = 1 << 14, + A_MulFlags = 1 << 15, + A_ReadC = 1 << 16, + A_RRXReadC = 1 << 17, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_IMPLEMENT_ALU_OP(x,k) \ - const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ +#define A_ARITH A_SetCV +#define A_LOGIC A_SetMaybeC +#define A_ARITH_IMM A_SetCV +#define A_LOGIC_IMM 0 + +#define A_IMPLEMENT_ALU_OP(x,k,a,c) \ + const u32 A_##x##_IMM = A_Write12 | c | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ - const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | 
ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); - -A_IMPLEMENT_ALU_OP(AND,BIOP) -A_IMPLEMENT_ALU_OP(EOR,BIOP) -A_IMPLEMENT_ALU_OP(SUB,BIOP) -A_IMPLEMENT_ALU_OP(RSB,BIOP) -A_IMPLEMENT_ALU_OP(ADD,BIOP) -A_IMPLEMENT_ALU_OP(ADC,BIOP) -A_IMPLEMENT_ALU_OP(SBC,BIOP) -A_IMPLEMENT_ALU_OP(RSC,BIOP) -A_IMPLEMENT_ALU_OP(ORR,BIOP) -A_IMPLEMENT_ALU_OP(MOV,MONOOP) -A_IMPLEMENT_ALU_OP(BIC,BIOP) -A_IMPLEMENT_ALU_OP(MVN,MONOOP) + const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(SUB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(RSB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADD,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(SBC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(RSC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(ORR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MOV,MONOOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(BIC,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; -#define A_IMPLEMENT_ALU_TEST(x) \ - const u32 A_##x##_IMM = A_Read16 | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); - -A_IMPLEMENT_ALU_TEST(TST) -A_IMPLEMENT_ALU_TEST(TEQ) -A_IMPLEMENT_ALU_TEST(CMP) -A_IMPLEMENT_ALU_TEST(CMN) - 
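// A sketch of how these table bits are consumed (illustrative; the real
// logic lives in Decode() below and floodFillSetFlags() in ARMJIT.cpp):
// each entry is a u32 where the low bits describe register reads/writes,
// the new A_Set*/A_ReadC bits describe flag behaviour, and ak() packs the
// instruction kind into the top bits -- which is why its shift grows from
// 13 to 18 in this commit. Decode() turns this into Info.ReadFlags and
// Info.WriteFlags, keeping flags that are always written in the low
// nibble and "maybe written" ones (the shifter carry-out) in the high
// nibble, roughly:
//
//     if (data & A_SetNZ)     res.WriteFlags |= flag_N | flag_Z;
//     if (data & A_SetMaybeC) res.WriteFlags |= flag_C << 4;
//     if (data & A_ReadC)     res.ReadFlags  |= flag_C;
//
// The JIT then flood-fills backwards from every flag read, so only the
// flag writes that a later instruction actually depends on get compiled.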
-const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); -const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); -const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); -const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); -const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); -const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +#define A_IMPLEMENT_ALU_TEST(x,a) \ + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST,LOGIC) +A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) +A_IMPLEMENT_ALU_TEST(CMP,ARITH) +A_IMPLEMENT_ALU_TEST(CMN,ARITH) + +const u32 A_MUL = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); @@ -161,7 +173,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 16) +#define tk(x) ((x) << 20) enum { T_Read0 = 1 << 0, @@ -183,42 +195,47 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15 + T_PopPC = 1 << 15, + + T_SetNZ = 1 << 16, + T_SetCV = 1 << 17, + T_SetMaybeC = 1 << 18, + T_ReadC = 1 << 19 }; -const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_Write0 | T_Read3 | tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM); - -const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); -const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); -const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_); -const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_); - -const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = T_Write8 | tk(tk_CMP_IMM); -const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM); -const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM); - -const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); -const u32 T_EOR_REG = 
T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); -const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); -const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); -const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); -const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); -const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); -const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); -const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG); -const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG); -const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG); -const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG); -const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); -const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); -const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); -const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG); +const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_SetNZ | T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_SetNZ | T_Write0 | T_Read3 | tk(tk_MVN_REG); const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); -const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_CMP_HIREG = T_SetNZ | T_SetCV | T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = 
T_Write8 | tk(tk_ADD_PCREL); @@ -268,10 +285,20 @@ const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); Info Decode(bool thumb, u32 num, u32 instr) { + const u8 FlagsReadPerCond[7] = { + flag_Z, + flag_C, + flag_N, + flag_V, + flag_C | flag_Z, + flag_N | flag_V, + flag_Z | flag_N | flag_V}; + Info res = {0}; if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + res.Kind = (data >> 20) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -309,7 +336,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_PopPC && instr & (1 << 8)) res.DstRegs |= 1 << 15; - res.Kind = (data >> 16) & 0x3F; + if (data & T_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & T_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & T_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if (data & T_ReadC) + res.ReadFlags |= flag_C; + + if (res.Kind == tk_BCOND) + res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; + res.EndBlock = res.Branches(); return res; @@ -323,7 +361,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 13) & 0x1FF; + res.Kind = (data >> 18) & 0x1FF; if (res.Kind == ak_MCR) { @@ -382,6 +420,26 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (data & A_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & A_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if ((data & A_MulFlags) && (instr & (1 << 20))) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_ReadC) + res.ReadFlags |= flag_C; + if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) + res.ReadFlags |= flag_C; + + if ((instr >> 28) < 0xE) + { + // make non conditional flag sets conditional + res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.ReadFlags |= FlagsReadPerCond[instr >> 29]; + } + res.EndBlock |= res.Branches(); return res; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 4fe9b10..5336837 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -215,11 +215,24 @@ enum tk_Count }; +enum +{ + flag_N = 1 << 3, + flag_Z = 1 << 2, + flag_C = 1 << 1, + flag_V = 1 << 0, +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 ReadFlags; + // lower 4 bits - set always + // upper 4 bits - might set flag + u8 WriteFlags; + bool EndBlock; bool Branches() { diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 0066668..c3db88d 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,6 +2675,8 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { + freopen("miauz.txt", "w", stdout); + srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From b5dda7d6e2f73941435a8e9a6b71804fad068319 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 23 Aug 2019 22:16:24 +0200 Subject: add ui confirm dialog for linux --- src/libui_sdl/libui/unix/stddialogs.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/libui_sdl/libui/unix/stddialogs.c b/src/libui_sdl/libui/unix/stddialogs.c index 3daeffa..10c598d 100644 --- a/src/libui_sdl/libui/unix/stddialogs.c +++ b/src/libui_sdl/libui/unix/stddialogs.c @@ -93,7 +93,7 @@ char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_SAVE, "_Save", filter, initpath); } -static void msgbox(GtkWindow *parent, const char *title, const char 
*description, GtkMessageType type, GtkButtonsType buttons) +static int msgbox(GtkWindow *parent, const char *title, const char *description, GtkMessageType type, GtkButtonsType buttons) { GtkWidget *md; @@ -101,8 +101,10 @@ static void msgbox(GtkWindow *parent, const char *title, const char *description type, buttons, "%s", title); gtk_message_dialog_format_secondary_text(GTK_MESSAGE_DIALOG(md), "%s", description); - gtk_dialog_run(GTK_DIALOG(md)); + int result = gtk_dialog_run(GTK_DIALOG(md)); gtk_widget_destroy(md); + + return result; } void uiMsgBox(uiWindow *parent, const char *title, const char *description) @@ -114,3 +116,11 @@ void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) { msgbox(windowWindow(parent), title, description, GTK_MESSAGE_ERROR, GTK_BUTTONS_OK); } + +int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) +{ + int result = + msgbox(windowWindow(parent), title, description, GTK_MESSAGE_QUESTION, GTK_BUTTONS_OK_CANCEL); + + return result == GTK_RESPONSE_OK; +} \ No newline at end of file -- cgit v1.2.3 From ea562d2fec9f4ab73e9ff3f519ff5ecb65736cd7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:06:27 +0200 Subject: fixes for flag optimisation --- src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- src/ARM_InstrInfo.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 3b6bc2e..5d92e47 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -163,6 +163,7 @@ CompiledBlock CompileBlock(ARM* cpu) { r15 += thumb ? 2 : 4; + instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6a7d711..f868ddf 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -387,7 +387,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); OR(32, R(RCPSR), R(RSCRATCH)); } - else + else if (carryUsed || retriveCV) { SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index ea6d827..3634c35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -436,7 +436,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional - res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.WriteFlags = (res.WriteFlags | (res.WriteFlags << 4)) & 0xF0; res.ReadFlags |= FlagsReadPerCond[instr >> 29]; } -- cgit v1.2.3 From 5202c505abe96e39814e9141e9487e3b549f28a4 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:09:03 +0200 Subject: remove debug printing --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ---- src/libui_sdl/main.cpp | 2 -- 2 files changed, 6 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6abb2bb..5e05446 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -385,15 +385,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); - printf("block start %d\n", Thumb); - for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 
2 : 4; CurInstr = instrs[i]; - printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); - CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 2ef776883f286f938fe03700780544c56867e467 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:09:00 +0200 Subject: more fixes for flag optimisation + small cycle counting optimisation --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 4 ++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 28 ++++++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 4 ++ src/ARM_InstrInfo.cpp | 92 ++++++++++++++++++++++--------------- 5 files changed, 86 insertions(+), 44 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 30b18d7..c0a8f1f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -19,6 +19,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // it's not completely safe to assume stuff like, which instructions to preload // we'll see how it works out + IrregularCycles = true; + u32 newPC; u32 cycles = 0; @@ -140,6 +142,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { + IrregularCycles = true; + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5e05446..d585f39 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -447,6 +447,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Comp_AddCycles_C(); else { + IrregularCycles = false; + FixupBranch skipExecute; if (cond < 0xE) skipExecute = CheckCondition(cond); @@ -463,13 +465,19 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (CurInstr.Cond() < 0xE) { - FixupBranch skipFailed = J(); - SetJumpTarget(skipExecute); + if (IrregularCycles) + { + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); - Comp_AddCycles_C(); + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); + } + else + SetJumpTarget(skipExecute); } + } } @@ -518,8 +526,16 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles); - LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + if (!Thumb && CurInstr.Cond() < 0xE) + { + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + } + else + { + ConstantCycles += i + cycles; + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + } } } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 8861884..a62f043 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -139,6 +139,8 @@ public: u8* ResetStart; u32 CodeMemSize; + bool IrregularCycles; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 3b4cb7d..bf8280d 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -438,6 +438,8 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) { + IrregularCycles = true; + if (store) MOV(32, R(ABI_PARAM2), rd); u32 cycles = Num @@ -459,6 +461,8 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { + IrregularCycles = true; + int regsCount = regs.Count(); if (decrement) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 3634c35..9239e29 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 18) +#define ak(x) ((x) << 21) enum { A_Read0 = 1 << 0, @@ -33,13 +33,21 @@ enum { A_MulFlags = 1 << 15, A_ReadC = 1 << 16, A_RRXReadC = 1 << 17, + A_StaticShiftSetC = 1 << 18, + A_SetC = 1 << 19, + + A_WriteMemory = 1 << 20, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_ARITH A_SetCV -#define A_LOGIC A_SetMaybeC +#define A_ARITH_LSL_IMM A_SetCV +#define A_LOGIC_LSL_IMM A_StaticShiftSetC +#define A_ARITH_SHIFT_IMM A_SetCV +#define A_LOGIC_SHIFT_IMM A_SetC +#define A_ARITH_SHIFT_REG A_SetCV +#define A_LOGIC_SHIFT_REG A_SetMaybeC #define A_ARITH_IMM A_SetCV #define A_LOGIC_IMM 0 @@ -55,14 +63,14 @@ enum { const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + const u32 
A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a##_LSL_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) @@ -80,15 +88,15 @@ A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; #define A_IMPLEMENT_ALU_TEST(x,a) \ - const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a##_IMM | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a##_LSL_IMM | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); A_IMPLEMENT_ALU_TEST(TST,LOGIC) A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) @@ -115,20 +123,20 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 
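// Sketch of intent (an assumption on my part -- the consumer of this bit
// is not part of this commit): A_WriteMemory tags every instruction that
// can store to memory (STR/STRB/STRD/STM/SWP below), presumably so the
// JIT can later treat such blocks specially, e.g. when guarding against
// self-modifying code. Note also the shifter-carry split introduced
// above: LSL #0 is the identity shift and leaves carry untouched, so an
// immediate LSL only definitely writes C for a non-zero shift amount,
// which is why Decode() checks
//
//     (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)
//
// while LSR/ASR immediate (where #0 encodes a shift by 32) use the
// unconditional A_SetC.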
-#define A_STR A_Read12 +#define A_STR A_Read12 | A_WriteMemory #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSR); \ const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ - const u32 A_##x##_REG_ROR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + const u32 A_##x##_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ \ const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ - const u32 A_##x##_POST_REG_ROR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + const u32 A_##x##_POST_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); A_IMPLEMENT_WB_LDRSTR(STR,STR) A_IMPLEMENT_WB_LDRSTR(STRB,STR) @@ -136,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double +#define A_STRD A_Read12Double | A_WriteMemory #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -151,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -173,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 20) +#define tk(x) ((x) << 21) enum { T_Read0 = 1 << 0, @@ -200,12 +208,13 @@ enum { T_SetNZ = 1 << 16, T_SetCV = 1 << 17, T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19 + T_ReadC = 1 << 19, + T_SetC = 1 << 20, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); @@ -213,7 +222,7 @@ const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = 
T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Read8 | tk(tk_CMP_IMM); const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); @@ -240,7 +249,7 @@ const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); -const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); +const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); @@ -298,7 +307,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 20) & 0x3F; + res.Kind = (data >> 21) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -344,12 +353,14 @@ Info Decode(bool thumb, u32 num, u32 instr) res.WriteFlags |= flag_C << 4; if (data & T_ReadC) res.ReadFlags |= flag_C; + if (data & T_SetC) + res.WriteFlags |= flag_C; + + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; - res.EndBlock = res.Branches(); - return res; } else @@ -361,7 +372,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 18) & 0x1FF; + res.Kind = (data >> 21) & 0x1FF; if (res.Kind == ak_MCR) { @@ -369,7 +380,7 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 cm = instr & 0xF; u32 cpinfo = (instr >> 5) & 0x7; u32 id = (cn<<8)|(cm<<4)|cpinfo; - if (id == 0x704 || id == 0x782) + if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) @@ -420,6 +431,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (res.Kind == ak_STM) + res.SrcRegs |= instr & (1 << 15); + if (data & A_SetNZ) res.WriteFlags |= flag_N | flag_Z; if (data & A_SetCV) @@ -432,6 +446,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.ReadFlags |= flag_C; if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) res.ReadFlags |= flag_C; + if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) + res.WriteFlags |= flag_C; if ((instr >> 28) < 0xE) { -- cgit v1.2.3 From 5338c28f408382263077b24bce5d5ab62bdf7024 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:48:20 +0200 Subject: load register only if needed - do thumb bl long merge in the first step - preparations for better branch jitting --- src/ARMJIT.cpp | 16 ++++++++++++++++ src/ARMJIT.h | 1 + src/ARMJIT_RegisterCache.h | 12 ++++++++---- src/ARMJIT_x64/ARMJIT_Branch.cpp | 12 +++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 34 ++++++++++++---------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARM_InstrInfo.h | 3 +++ 7 files changed, 48 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 5d92e47..85cadf3 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -159,6 +159,7 @@ CompiledBlock CompileBlock(ARM* cpu) u32 r15 = cpu->R[15]; cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstrAddr[2] = {blockAddr, r15}; do { r15 += thumb ? 
2 : 4; @@ -166,6 +167,10 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + + instrs[i].Addr = nextInstrAddr[0]; + nextInstrAddr[0] = nextInstrAddr[1]; + nextInstrAddr[1] = r15; if (cpu->Num == 0) { @@ -193,8 +198,19 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 + && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) + { + instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG; + instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16); + instrs[i - 1].Info.DstRegs = 0xC000; + instrs[i - 1].Info.SrcRegs = 0; + instrs[i - 1].Info.EndBlock = true; + i--; + } i++; + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 6197695..7e448ef 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -31,6 +31,7 @@ struct FetchedInstr u8 SetFlags; u32 Instr; u32 NextInstr[2]; + u32 Addr; u8 CodeCycles; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 04c1eda..fe2f203 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -38,7 +38,7 @@ public: Mapping[reg] = (Reg)-1; } - void LoadRegister(int reg) + void LoadRegister(int reg, bool loadValue) { assert(Mapping[reg] == -1); for (int i = 0; i < NativeRegsAvailable; i++) @@ -50,7 +50,8 @@ public: NativeRegsUsed |= 1 << (int)nativeReg; LoadedRegs |= 1 << reg; - Compiler->LoadReg(reg, nativeReg); + if (loadValue) + Compiler->LoadReg(reg, nativeReg); return; } @@ -66,7 +67,7 @@ public: UnloadRegister(reg); } - void Prepare(int i) + void Prepare(bool thumb, int i) { u16 futureNeeded = 0; int ranking[16]; @@ -111,8 +112,11 @@ public: loadedSet.m_val = LoadedRegs; } + BitSet16 needValueLoaded(needToBeLoaded); + if (thumb || Instr.Cond() >= 0xE) + needValueLoaded = BitSet16(Instr.Info.SrcRegs); for (int reg : needToBeLoaded) - LoadRegister(reg); + LoadRegister(reg, needValueLoaded[reg]); } DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c0a8f1f..cc7a3c4 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -271,15 +271,17 @@ void Compiler::T_Comp_BL_LONG_2() Comp_JumpTo(RSCRATCH); } -void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +void Compiler::T_Comp_BL_Merged() { - assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); Comp_AddCycles_C(); - u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); - target += (CurInstr.Instr & 0x7FF) << 1; + R15 += 2; - if (Num == 1 || CurInstr.Instr & (1 << 12)) + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) target |= 1; MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d585f39..d8ce1aa 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -338,7 +338,8 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { // Branch F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), 
F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), // Unk, SVC - NULL, NULL + NULL, NULL, + F(T_Comp_BL_Merged) }; #undef F @@ -361,21 +362,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; - R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; CurCPU = cpu; CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); if (!(Num == 0 - ? IsMapped<0>(R15 - (Thumb ? 2 : 4)) - : IsMapped<1>(R15 - (Thumb ? 2 : 4)))) + ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) + : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } - bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -387,8 +385,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { - R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] @@ -406,29 +404,21 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } if (comp != NULL) - RegCache.Prepare(i); + RegCache.Prepare(Thumb, i); else RegCache.Flush(); if (Thumb) { - if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 - && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) - mergedThumbBL = true; - else + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) - { - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); - } - else if (mergedThumbBL) - T_Comp_BL_Merged(instrs[i - 1]); - else - (this->*comp)(); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } + else + (this->*comp)(); } else { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a62f043..fcb2380 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -90,7 +90,7 @@ public: void T_Comp_BranchXchangeReg(); void T_Comp_BL_LONG_1(); void T_Comp_BL_LONG_2(); - void T_Comp_BL_Merged(FetchedInstr prefix); + void T_Comp_BL_Merged(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 5336837..d01c600 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -212,6 +212,9 @@ enum tk_UNK, tk_SVC, + // not a real instruction + tk_BL_LONG, + tk_Count }; -- cgit v1.2.3 From a687be9879e5cab4ea5d8646c8cf47c214b18856 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... 
- more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- src/libui_sdl/DlgEmuSettings.cpp | 22 +- 19 files changed, 1550 insertions(+), 689 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index e404943..423c940 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -580,21 +580,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -710,23 +715,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! 
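+            // note the reordering below: cycles are committed to ARM7Timestamp
+            // before the IRQ/halt handling, so the idle-loop fast-forward that
+            // follows starts from an up-to-date timestamp instead of carrying
+            // the block's uncommitted cycles across the halt check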
+ if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -736,6 +746,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -758,6 +770,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 4d387bc..8a01068 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -299,7 +299,7 @@ public: { *val = NDS::ARM7Read8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -308,7 +308,7 @@ public: *val = NDS::ARM7Read16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -317,7 +317,7 @@ public: *val = NDS::ARM7Read32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -325,14 +325,14 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -341,7 +341,7 @@ public: NDS::ARM7Write16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -350,7 +350,7 @@ public: NDS::ARM7Write32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -358,7 +358,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include +#include #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) 
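+// as defined here the macro expands to nothing, so the JIT_DEBUGPRINT() traces
+// below are compiled out; point it at printf to log block compilation,
+// invalidation and idle-loop detection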
+ Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, 
+ DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, 
regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? 
TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? 
"thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. 
- Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, - we only need every second half word to be adressable. +extern u32 AddrTranslate9[0x2000]; +extern u32 AddrTranslate7[0x4000]; - In case a memory write hits mapped memory, the function block at this - address is set to null, so it's recompiled the next time it's executed. - - This method has disadvantages, namely that only writing to the - first instruction of a block marks it as invalid and that memory remapping - (SWRAM and VRAM) isn't taken into account. -*/ - -struct BlockCache -{ - CompiledBlock* AddrMapping9[0x2000] = {0}; - CompiledBlock* AddrMapping7[0x4000] = {0}; - - CompiledBlock MainRAM[4*1024*1024/2]; - CompiledBlock SWRAM[0x8000/2]; // Shared working RAM - CompiledBlock ARM9_ITCM[0x8000/2]; - CompiledBlock ARM9_LCDC[0xA4000/2]; - CompiledBlock ARM9_BIOS[0x8000/2]; - CompiledBlock ARM7_BIOS[0x4000/2]; - CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM -}; - -extern BlockCache cache; +const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... +extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; } template -inline CompiledBlock LookUpBlock(u32 addr) +inline u32 TranslateAddr(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } template -inline void Invalidate16(u32 addr) +inline JitBlockEntry LookUpBlock(u32 addr) { - if (IsMapped(addr)) - { - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; - } -} - -template -inline void Invalidate32(u32 addr) -{ - if (IsMapped(addr)) - { - if (num == 0) - { - CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; - page[(addr & 0x7FFF) >> 1] = NULL; - page[((addr + 2) & 0x7FFF) >> 1] = NULL; - } - else - { - CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; - } - } -} - -template -inline void InsertBlock(u32 addr, CompiledBlock func) -{ - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + return FastBlockAccess[TranslateAddr(addr) / 2]; } void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateAll(); + +void InvalidateITCM(u32 addr); +void InvalidateByAddr7(u32 addr); + +void CompileBlock(ARM* cpu); -void InvalidateBlockCache(); +void ResetBlockCache(); } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h new file mode 100644 index 0000000..4acb488 --- /dev/null +++ b/src/ARMJIT_Internal.h @@ -0,0 +1,198 @@ +#ifndef ARMJIT_INTERNAL_H +#define ARMJIT_INTERNAL_H + +#include "types.h" +#include + 
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
Blocks; + u16 TimesInvalidated; +}; + +extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +typedef void (*InterpreterFunc)(ARM* cpu); +extern InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index fe2f203..ed6a2b7 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -60,15 +60,46 @@ public: assert("Welp!"); } + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + void Flush() { BitSet16 loadedSet(LoadedRegs); for (int reg : loadedSet) UnloadRegister(reg); + LiteralsLoaded = 0; } void Prepare(bool thumb, int i) { + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + u16 futureNeeded = 0; int ranking[16]; for (int j = 0; j < 16; j++) @@ -86,7 +117,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; + FetchedInstr Instr = Instrs[i]; u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -125,6 +156,9 @@ public: static const int NativeRegsAvailable; Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; u32 NativeRegsUsed = 0; u16 LoadedRegs = 0; u16 DirtyRegs = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f868ddf..14c223b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp() MOV(32, rd, op2); if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); if (S) { @@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); - if (op & 1) + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); @@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU() u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) - Comp_AddCycles_CI(1); + Comp_AddCycles_CI(1); // shift by reg else Comp_AddCycles_C(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cc7a3c4..0dedb3f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -16,9 +16,6 @@ int squeezePointer(T* ptr) void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { // we can simplify constant branches by a lot - // it's not completely safe to assume stuff like, which instructions to preload - // we'll see how it works out - IrregularCycles = true; u32 newPC; @@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, 
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..10c3b1b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,9 +561,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -813,7 +815,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -837,7 +839,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -861,8 +863,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -886,8 +887,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 3cff0ed..63d61a3 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,7 @@ int GL_Antialias; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -50,6 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c13eae3..0fcefc3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -49,6 +49,7 @@ extern int GL_Antialias; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 1baa308..e9e6795 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -536,7 +536,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -757,7 +757,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -1870,10 +1870,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1924,10 +1920,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1994,10 +1986,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2292,7 +2280,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2355,7 +2343,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2428,7 +2416,7 @@ void ARM7Write16(u32 
addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 09ea8eb..45e8e0c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -42,6 +42,7 @@ uiCheckbox* cbDirectBoot; #ifdef JIT_ENABLED uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; +uiCheckbox* cbJITBranchOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -64,13 +65,15 @@ void OnOk(uiButton* btn, void* blarg) bool enableJit = uiCheckboxChecked(cbJITEnabled); char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); + bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || + branchOptimisations != Config::JIT_BrancheOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -79,6 +82,7 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; + Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); restart = true; } @@ -101,9 +105,15 @@ void OnOk(uiButton* btn, void* blarg) void OnJITStateChanged(uiCheckbox* cb, void* blarg) { if (uiCheckboxChecked(cb)) + { uiControlEnable(uiControl(enJITMaxBlockSize)); + uiControlEnable(uiControl(cbJITBranchOptimisations)); + } else + { uiControlDisable(uiControl(enJITMaxBlockSize)); + uiControlDisable(uiControl(cbJITBranchOptimisations)); + } } #endif @@ -159,6 +169,14 @@ void Open() enJITMaxBlockSize = uiNewEntry(); uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); + } } #endif @@ -194,6 +212,8 @@ void Open() uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); } OnJITStateChanged(cbJITEnabled, NULL); + + uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); #endif uiControlShow(uiControl(win)); -- cgit v1.2.3 From 7424f9fda06bd15f0e00717b962a5ca8a00540b7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:14:33 +0200 Subject: remove leftover debug code --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 25c55a3..a994d34 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -598,8 +598,6 @@ void Compiler::Comp_AddCycles_CDI() cycles = numC + numD + 1; } - printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); - if (!Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else -- cgit v1.2.3 From aa23f21b8df9780578adf6e6ea6bcfba3fee83bb Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 16 Oct 2019 23:39:12 +0200 Subject: decrease jit block cache address granularity fixes Dragon Quest IX move code with side effects out of assert, fixes release build (thanks to m4wx for this one) also remove some 
leftovers of jit pipelining --- src/ARMJIT.cpp | 42 ++++++++++++++++++++++--------------- src/ARMJIT_Internal.h | 3 +-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 31 ++++++++++++++------------- src/ARM_InstrInfo.cpp | 25 ++++++++++++++-------- src/ARM_InstrInfo.h | 3 ++- src/libui_sdl/main.cpp | 2 ++ 6 files changed, 62 insertions(+), 44 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 686bdd6..19a5e70 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -106,7 +106,7 @@ u32 AddrTranslate9[0x2000]; u32 AddrTranslate7[0x4000]; JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; -AddressRange CodeRanges[ExeMemSpaceSize / 256]; +AddressRange CodeRanges[ExeMemSpaceSize / 512]; TinyVector JitBlocks; JitBlock* RestoreCandidates[0x1000] = {NULL}; @@ -285,6 +285,13 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = #undef F_MEM_HD #undef F +void T_BL_LONG(ARM* cpu) +{ + ARMInterpreter::T_BL_LONG_1(cpu); + cpu->R[15] += 2; + ARMInterpreter::T_BL_LONG_2(cpu); +} + #define F(x) ARMInterpreter::T_##x InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = { @@ -302,7 +309,7 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = F(PUSH), F(POP), F(LDMIA), F(STMIA), F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), F(UNK), F(SVC), - NULL // BL_LONG psudo opcode + T_BL_LONG // BL_LONG psudo opcode }; #undef F @@ -341,7 +348,7 @@ void CompileBlock(ARM* cpu) JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), - CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -352,7 +359,7 @@ void CompileBlock(ARM* cpu) instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; - instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + nextInstr[0] = nextInstr[1]; instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; @@ -361,7 +368,7 @@ void CompileBlock(ARM* cpu) u32 translatedAddr = (cpu->Num == 0 ? 
TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) { bool returning = false; @@ -400,7 +407,6 @@ void CompileBlock(ARM* cpu) nextInstr[1] = cpuv4->CodeRead32(r15); instrs[i].CodeCycles = cpu->CodeCycles; } - instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); cpu->R[15] = r15; @@ -584,7 +590,7 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; @@ -595,7 +601,7 @@ void CompileBlock(ARM* cpu) void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); - AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + AddressRange* range = &CodeRanges[pseudoPhysical / 512]; int startLength = range->Blocks.Length; for (int i = 0; i < range->Blocks.Length; i++) { @@ -604,15 +610,17 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 256) != (pseudoPhysical / 256)) + if ((addr / 512) != (pseudoPhysical / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 256]; + AddressRange* otherRange = &CodeRanges[addr / 512]; assert(otherRange != range); - assert(otherRange->Blocks.RemoveByValue(block)); + bool removed = otherRange->Blocks.RemoveByValue(block); + assert(removed); } } - assert(JitBlocks.RemoveByValue(block)); + bool removed = JitBlocks.RemoveByValue(block); + assert(removed); FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; @@ -631,14 +639,14 @@ void InvalidateByAddr(u32 pseudoPhysical) void InvalidateByAddr7(u32 addr) { u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) InvalidateByAddr(pseudoPhysical); } void InvalidateITCM(u32 addr) { u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) InvalidateByAddr(pseudoPhysical); } @@ -654,7 +662,7 @@ void InvalidateAll() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeRanges[addr / 256]; + AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; @@ -689,8 +697,8 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 256].Blocks.Clear(); - CodeRanges[addr / 256].TimesInvalidated = 0; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].TimesInvalidated = 0; } delete block; } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4acb488..9e6713d 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -38,7 +38,6 @@ struct FetchedInstr u8 BranchFlags; u8 SetFlags; u32 Instr; - u32 NextInstr[2]; u32 Addr; u8 CodeCycles; @@ -185,7 +184,7 @@ struct __attribute__((packed)) AddressRange u16 TimesInvalidated; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; +extern AddressRange 
CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 13ca415..eb01c87 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -105,7 +105,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) static_assert(sizeof(AddressRange) == 16); LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(8)); + SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); @@ -203,7 +203,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(8)); + SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); @@ -284,28 +284,29 @@ void fault(u32 a, u32 b) void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - if (flags & memop_Store) - { - Comp_AddCycles_CD(); - } - else - { - Comp_AddCycles_CDI(); - } - u32 addressMask = ~0; if (size == 32) addressMask = ~3; if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { - Comp_MemLoadLiteral(size, rd, - R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); + Comp_MemLoadLiteral(size, rd, addr); + return; } - else + { + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } + OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 0fbde26..1261bbe 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 21) +#define ak(x) ((x) << 22) enum { A_Read0 = 1 << 0, @@ -36,7 +36,8 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMem = 1 << 20 + A_WriteMem = 1 << 20, + A_LoadMem = 1 << 21 }; #define A_BIOP A_Read16 @@ -122,7 +123,7 @@ const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); -#define A_LDR A_Write12 +#define A_LDR A_Write12 | A_LoadMem #define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ @@ -143,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(STRB,STR) A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) -#define A_LDRD A_Write12Double +#define A_LDRD A_Write12Double | A_LoadMem #define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ @@ -159,10 +160,10 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_LDM = A_Read16 | A_MemWriteback | A_LoadMem | ak(ak_LDM); const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); @@ -360,6 +361,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; + + if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + res.SpecialKind = special_LoadLiteral; res.EndBlock |= res.Branches(); @@ -377,7 +381,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 21) & 0x1FF; + res.Kind = (data >> 22) & 0x1FF; if (res.Kind == ak_MCR) { @@ -454,12 +458,15 @@ Info Decode(bool thumb, u32 num, u32 instr) res.ReadFlags |= flag_C; if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) res.ReadFlags |= flag_C; - if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) + if ((data & A_SetC) || ((data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F))) res.WriteFlags |= flag_C; if (data & A_WriteMem) res.SpecialKind = special_WriteMem; + if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d02f168..c032a4f 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -230,7 +230,8 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, - special_WaitForInterrupt + special_WaitForInterrupt, + special_LoadLiteral }; struct Info diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 0066668..c3db88d 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,6 +2675,8 @@ void 
RecreateMainWindow(bool opengl) int main(int argc, char** argv) { + freopen("miauz.txt", "w", stdout); + srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 81f38c14be0d9ba5a3da8f67d9719ed2c47279c5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 13:29:17 +0200 Subject: integrate changes from ARM64 backend and more - better handle LDM/STM in reg alloc - unify Halted and IRQ in anticipation for branch inlining - literal optimisations can be disabled in gui - jit blocks follow simple returns - fix idle loop detection - break jit blocks on IRQ (fixes saving in Pokemon White) --- src/ARM.cpp | 40 ++++++++++++++++++----------- src/ARM.h | 13 +++++++--- src/ARMJIT.cpp | 50 +++++++++++++++++++++++++++++++------ src/ARMJIT_RegisterCache.h | 33 +++++++++++++++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 7 +++--- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++---- src/ARM_InstrInfo.cpp | 28 +++++++++++++++++++++ src/ARM_InstrInfo.h | 2 +- src/Config.cpp | 2 ++ src/Config.h | 1 + src/NDS.cpp | 2 +- src/libui_sdl/DlgEmuSettings.cpp | 31 ++++++++++++++++++++--- src/libui_sdl/main.cpp | 2 -- 13 files changed, 179 insertions(+), 48 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 423c940..4fab60e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -113,7 +113,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -589,16 +589,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -726,16 +731,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index 8a01068..e252d23 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e70..0695b85 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) 
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = 
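// The extra block-termination test added to the do-while above ("break jit
// blocks on IRQ"), restated standalone: a block may only grow past an
// instruction if no interrupt can be taken there. Bit 7 of CPSR is the ARM
// I flag; while it is set, pending IRQs are masked, so compilation may
// safely continue.
#include <cstdint>

inline bool CanExtendBlock(uint32_t pendingIRQ, uint32_t cpsr)
{
    return !pendingIRQ || (cpsr & 0x80);  // no IRQ pending, or IRQs masked
}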
CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
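// Why rn == 15 is replaced by Imm32(R15 & ~0x2) for Thumb above: when a
// Thumb instruction uses PC as a load base, the architecture forces bit 1
// of the value read to zero, so literal pools are always word-aligned. A
// small standalone restatement of the address computation:
#include <cstdint>

inline uint32_t ThumbPcRelAddr(uint32_t instrAddr, uint32_t offset)
{
    uint32_t r15 = instrAddr + 4;  // pipelined PC, two halfwords ahead
    return (r15 & ~2u) + offset;   // bit 1 of the base reads as zero
}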
-1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe..8f8bd35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4f..2732181 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index 63d61a3..eb5bfcc 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -38,6 +38,7 @@ int GL_Antialias; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,6 +53,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 0fcefc3..723ab13 100644 --- a/src/Config.h +++ b/src/Config.h @@ -50,6 +50,7 @@ extern int GL_Antialias; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index e9e6795..141c565 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1142,7 +1142,7 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); } else { diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 45e8e0c..0df9c6c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -43,6 +43,7 @@ uiCheckbox* cbDirectBoot; uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; uiCheckbox* cbJITBranchOptimisations; +uiCheckbox* 
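// The NotStrictlyNeeded bits introduced above, restated: every register in
// an LDM/STM list is a source or destination, but reserving a host register
// for each entry would starve the allocator. Marking list-only registers as
// "not strictly needed" lets RegisterCache::Prepare() load them only while
// spare native registers remain. Illustrative helper (hypothetical name)
// mirroring the ak_LDM case:
#include <cstdint>

inline void MarkLdmDsts(uint32_t instr, uint16_t srcRegs,
                        uint16_t& dstRegs, uint16_t& notStrictlyNeeded)
{
    uint16_t set = (instr & 0xFFFF) & ~(srcRegs | dstRegs | (1u << 15));
    dstRegs |= set;            // still written by the LDM...
    notStrictlyNeeded |= set;  // ...but a host register for it is optional
}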
cbJITLiteralOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -66,14 +67,16 @@ void OnOk(uiButton* btn, void* blarg) char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || - branchOptimisations != Config::JIT_BrancheOptimisations) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize + || branchOptimisations != Config::JIT_BrancheOptimisations + || literalOptimisations != Config::JIT_LiteralOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -82,7 +85,8 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + Config::JIT_BrancheOptimisations = branchOptimisations; + Config::JIT_LiteralOptimisations = literalOptimisations; restart = true; } @@ -108,11 +112,13 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) { uiControlEnable(uiControl(enJITMaxBlockSize)); uiControlEnable(uiControl(cbJITBranchOptimisations)); + uiControlEnable(uiControl(cbJITLiteralOptimisations)); } else { uiControlDisable(uiControl(enJITMaxBlockSize)); uiControlDisable(uiControl(cbJITBranchOptimisations)); + uiControlDisable(uiControl(cbJITLiteralOptimisations)); } } #endif @@ -174,9 +180,25 @@ void Open() uiBox* row = uiNewHorizontalBox(); uiBoxAppend(in_ctrl, uiControl(row), 0); - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); + uiBoxAppend(row, uiControl(lbl), 0); + } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); + uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); + } } #endif @@ -214,6 +236,7 @@ void Open() OnJITStateChanged(cbJITEnabled, NULL); uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); + uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); #endif uiControlShow(uiControl(win)); diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 803c61e1266040c631a716a37105615a998a38af Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 18:03:31 +0200 Subject: fix config key for jit literal optimisations --- src/Config.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/Config.cpp b/src/Config.cpp index eb5bfcc..be6a833 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -53,7 +53,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 
0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} -- cgit v1.2.3 From 386100c053adad10ab7de066d37f383d58d5cfa1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 3 Nov 2019 15:33:20 +0100 Subject: make literal optimisation more reliable fixes spanish Pokemon HeartGold --- src/ARMJIT.cpp | 52 +++++++++++++++++++++++++++++++++---- src/ARMJIT.h | 2 +- src/ARMJIT_Internal.h | 3 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 34 +++++++++++++++++++----- 4 files changed, 77 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 0695b85..c7387c9 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -161,6 +161,27 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } +bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +{ + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_STR_IMM: + case ARMInstrInfo::ak_STRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_STRD_IMM: + case ARMInstrInfo::ak_STRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} + addr = instr.Addr + 8; + return true; + default: + JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); + return false; + } +} + bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, u32& linkAddr, u32& targetAddr) { @@ -463,6 +484,23 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem + && instrs[i].Info.SrcRegs == (1 << 15) + && instrs[i].Info.DstRegs == 0) + { + assert (!thumb); + + u32 addr; + if (DecodeLiteral(instrs[i], addr)) + { + JIT_DEBUGPRINT("pc relative write detected\n"); + u32 translatedAddr = cpu->Num == 0 ? 
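// The address arithmetic in DecodeLiteral() above as a standalone function:
// for an ARM STR/STRB with an immediate offset and PC as the base, the
// effective address is the instruction address + 8 (pipeline), plus or
// minus the 12-bit immediate depending on the U bit (bit 23).
#include <cstdint>

inline uint32_t StrImmTarget(uint32_t instrAddr, uint32_t instr)
{
    int32_t imm = (int32_t)(instr & 0xFFF);
    return (instrAddr + 8) + ((instr & (1u << 23)) ? imm : -imm);
}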
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + ARMJIT::InvalidateByAddr(translatedAddr, false); + CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); + } + } + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -631,7 +669,7 @@ void CompileBlock(ARM* cpu) JitBlocks.Add(block); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; @@ -657,11 +695,14 @@ void InvalidateByAddr(u32 pseudoPhysical) FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; + if (mayRestore) + { + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; - RestoreCandidates[slot] = block; + RestoreCandidates[slot] = block; + } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) range->TimesInvalidated++; @@ -732,6 +773,7 @@ void ResetBlockCache() u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); CodeRanges[addr / 512].TimesInvalidated = 0; + CodeRanges[addr / 512].InvalidLiterals = 0; } delete block; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 1db4d66..09cc463 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -61,7 +61,7 @@ inline JitBlockEntry LookUpBlock(u32 addr) void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); void InvalidateAll(); void InvalidateITCM(u32 addr); diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 9e6713d..fb05f75 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -63,7 +63,7 @@ struct __attribute__((packed)) TinyVector { T* Data = NULL; u16 Capacity = 0; - u32 Length = 0; // make it 32 bit so we don't need movzx + u16 Length = 0; ~TinyVector() { @@ -181,6 +181,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; + u16 InvalidLiterals; u16 TimesInvalidated; }; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 3799774..82f80a7 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -108,7 +108,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); - CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); JMP((u8*)InvalidateByAddr, true); SetJumpTarget(noCode); @@ -206,7 +206,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(ABI_PARAM4), R(RSCRATCH)); SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); - CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); @@ 
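// How the InvalidLiterals field added to AddressRange above is meant to
// work, sketched here with a mask widened to 32 bits so that each of the 32
// possible 16-byte granules of a 512-byte code range gets its own bit: a
// PC-relative write marks its granule, and literal loads are only
// constant-folded from unmarked granules. Names are illustrative.
#include <cstdint>

struct RangeLite { uint32_t invalidLiterals = 0; };  // simplified AddressRange

inline void MarkLiteralWrite(RangeLite* ranges, uint32_t addr)
{
    ranges[addr / 512].invalidLiterals |= 1u << ((addr & 0x1FF) / 16);
}

inline bool LiteralFoldable(const RangeLite* ranges, uint32_t addr)
{
    return !(ranges[addr / 512].invalidLiterals & (1u << ((addr & 0x1FF) / 16)));
}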
-278,10 +278,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) Comp_AddCycles_CDI(); } -void fault(u32 a, u32 b) +/*void fault(u32 a, u32 b, u32 c, u32 d) { - printf("actually not static! %x %x\n", a, b); -} + printf("actually not static! %x %x %x %x\n", a, b, c, d); +}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,11 +291,17 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; + //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - Comp_MemLoadLiteral(size, rd, addr); - return; + u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) + { + Comp_MemLoadLiteral(size, rd, addr); + return; + } } { @@ -438,6 +444,20 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz CALL(memoryFunc); + /*if (Num == 0 && check) + { + CMP(32, R(EAX), rdMapped); + FixupBranch notEqual = J_CC(CC_E); + ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); + MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8))); + MOV(32, R(ABI_PARAM2), R(EAX)); + MOV(32, R(ABI_PARAM3), rdMapped); + MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); + CALL((u8*)fault); + ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); + SetJumpTarget(notEqual); + }*/ + if (!(flags & memop_Store)) { if (inlinePreparation && size == 32) -- cgit v1.2.3 From 0c5311731b8e249c17ce68af1d026aca230e7711 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 5 Nov 2019 18:50:17 +0100 Subject: make savestates 100% compatible again --- src/ARM.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 4fab60e..9ab9546 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -113,7 +113,11 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&StopExecution); + + // hack to make save states compatible + u32 halted = Halted; + file->Var32(&halted); + Halted = halted; file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); -- cgit v1.2.3 From 60650fa82e03dc8eb2a6118ce4cf2e4b0aa872e5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 6 Dec 2019 22:16:23 +0100 Subject: disable literal optimations in DTCM --- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 82f80a7..b66f304 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -347,8 +347,10 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz // stupid dtcm... 
if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; + // disable this for now as DTCM is located in heap + // which might excced the RIP-addressable range + //region.Mem = cpu5->DTCM; + //region.Mask = 0x3FFF; } else { -- cgit v1.2.3 From 9b98b8816a1dc1373ce9a57aef845263456702c3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 17:28:51 +0100 Subject: improve nop handling and proper behaviour for LDM^ fixes dslinux --- src/ARM.cpp | 2 ++ src/ARMJIT.cpp | 13 +++++++++---- src/ARMJIT_RegisterCache.h | 2 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 6 +++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 1 + src/ARMJIT_x64/ARMJIT_Compiler.h | 2 ++ src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 5 +++-- src/ARM_InstrInfo.cpp | 2 ++ src/ARM_InstrInfo.h | 2 ++ 9 files changed, 25 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..07cc472 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -725,6 +725,8 @@ void ARMv4::ExecuteJIT() return; } + //printf("executing armv4 at %08x\n", instrAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); if (block) Cycles += block(); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index c7387c9..8fd7708 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -273,6 +273,8 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) typedef void (*InterpreterFunc)(ARM* cpu); +void NOP(ARM* cpu) {} + #define F(x) &ARMInterpreter::A_##x #define F_ALU(name, s) \ F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ @@ -320,7 +322,8 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = F(LDM), F(STM), F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), - F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC), + NOP }; #undef F_ALU #undef F_MEM_WB @@ -387,8 +390,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", - blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], cpu->Num == 0 ? 
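// The constraint behind the DTCM change above, made concrete: the x64
// emitter reaches statically known memory through 32-bit displacements,
// which only works when the target lies within +/-2 GiB of the generated
// code. DTCM is heap-allocated, so that cannot be guaranteed. A check of
// the kind an emitter would need (sketch):
#include <cstdint>

inline bool InDisp32Range(const void* target, const void* codeNear)
{
    intptr_t diff = (intptr_t)target - (intptr_t)codeNear;
    return diff == (intptr_t)(int32_t)diff;  // fits a signed 32-bit displacement
}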
LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); @@ -473,7 +476,9 @@ void CompileBlock(ARM* cpu) else { u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); - assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] + || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 2222bc2..b894657 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -152,7 +152,7 @@ public: needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); - } + } { BitSet16 loadedSet(LoadedRegs); BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 0dedb3f..e02865d 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -134,7 +134,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); bool previouslyDirty = CPSRDirty; SaveCPSR(); @@ -156,12 +156,12 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) if (!restoreCPSR) XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); else - MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste if (Num == 0) CALL((void*)&ARMv5::JumpTo); else CALL((void*)&ARMv4::JumpTo); - + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) { for (int reg : hiRegsLoaded) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd38724..5afe842 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -308,6 +308,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff NULL, NULL, NULL, NULL, NULL, NULL, NULL, + F(Nop) }; const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 792ff66..2cb57dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -79,6 +79,8 @@ public: opInvertOp2 = 1 << 5, }; + void Nop() {} + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b66f304..4cafc1c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -531,7 +531,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -545,7 +545,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); - SaveReg(reg, ABI_PARAM3); + else + SaveReg(reg, 
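// Context for the "usermode && !regs[15]" test in Comp_MemAccessBlock above
// ("proper behaviour for LDM^"): an LDM/STM with the S bit set normally
// transfers the user-mode copies of R8-R14, but when the register list of
// an LDM includes PC, the S bit instead means "also restore CPSR from SPSR"
// and the current bank is used. A decoder-side restatement (illustrative):
#include <cstdint>

enum class LdmSMode { UserBank, RestoreCPSR };

inline LdmSMode LdmSBitMeaning(uint16_t regList, bool isLoad)
{
    bool pcInList = regList & (1u << 15);
    return (isLoad && pcInList) ? LdmSMode::RestoreCPSR : LdmSMode::UserBank;
}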
ABI_PARAM3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 8f8bd35..08e2f0a 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -392,6 +392,8 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; + else if ((instr >> 28) == 0xF) + data = ak(ak_Nop); if (data & A_UnkOnARM7 && num != 0) data = A_UNK; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 2732181..6ab4929 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -139,6 +139,8 @@ enum ak_MRC, ak_SVC, + ak_Nop, + ak_Count, tk_LSL_IMM = 0, -- cgit v1.2.3 From 842df432aa05f16cdf76b96ab3523d99059cdd1b Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 17:38:04 +0100 Subject: remove debug leftovers --- src/ARM.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 07cc472..9ab9546 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -725,8 +725,6 @@ void ARMv4::ExecuteJIT() return; } - //printf("executing armv4 at %08x\n", instrAddr); - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); if (block) Cycles += block(); -- cgit v1.2.3 From d6cc7de6c4b571b24809a0d9665ec6160fe5ff6d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 18:29:52 +0100 Subject: move ARM64 JIT backend here --- CMakeLists.txt | 2 +- src/ARM.h | 9 +- src/ARMJIT.cpp | 4 + src/ARMJIT_A64/ARMJIT_ALU.cpp | 837 +++++++ src/ARMJIT_A64/ARMJIT_Branch.cpp | 452 ++++ src/ARMJIT_A64/ARMJIT_Compiler.cpp | 707 ++++++ src/ARMJIT_A64/ARMJIT_Compiler.h | 234 ++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 848 +++++++ src/ARM_InstrInfo.cpp | 7 +- src/CMakeLists.txt | 27 +- src/dolphin/Align.h | 24 + src/dolphin/Arm64Emitter.cpp | 4466 +++++++++++++++++++++++++++++++++++ src/dolphin/Arm64Emitter.h | 1152 +++++++++ src/dolphin/ArmCommon.h | 27 + src/dolphin/BitUtils.h | 254 ++ src/dolphin/Compat.h | 12 + src/dolphin/MathUtil.cpp | 13 + src/dolphin/MathUtil.h | 121 + 18 files changed, 9188 insertions(+), 8 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_ALU.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Branch.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Compiler.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_A64/ARMJIT_LoadStore.cpp create mode 100644 src/dolphin/Align.h create mode 100644 src/dolphin/Arm64Emitter.cpp create mode 100644 src/dolphin/Arm64Emitter.h create mode 100644 src/dolphin/ArmCommon.h create mode 100644 src/dolphin/BitUtils.h create mode 100644 src/dolphin/MathUtil.cpp create mode 100644 src/dolphin/MathUtil.h (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index d59e19c..9a0388d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ detect_architecture("__i386__" x86) detect_architecture("__arm__" ARM) detect_architecture("__aarch64__" ARM64) -if (ARCHITECTURE STREQUAL x86_64) +if (ARCHITECTURE STREQUAL x86_64 OR ARCHITECTURE STREQUAL ARM64) option(ENABLE_JIT "Enable x64 JIT recompiler" ON) endif() diff --git a/src/ARM.h b/src/ARM.h index e252d23..8282c01 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -246,10 +246,14 @@ public: u32 DTCMSetting, ITCMSetting; - u8 ITCM[0x8000]; + // for aarch64 JIT they need to go up here + // to be addressable by a 12-bit immediate u32 ITCMSize; - u8 DTCM[0x4000]; u32 DTCMBase, DTCMSize; + s32 RegionCodeCycles; + + u8 ITCM[0x8000]; + u8 
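// The Decode() addition above in isolation: on the ARM9, an instruction
// whose condition field is the never-condition 0xF is decoded to an
// explicit NOP kind unless it matches the unconditional BLX immediate
// encoding, which is checked first. Standalone restatement with
// illustrative kind values:
#include <cstdint>

enum Kind : uint32_t { KindNormal, KindBlxImm, KindNop };

inline uint32_t ClassifyCond0xF(uint32_t instr, bool arm9, uint32_t normal)
{
    if (arm9 && (instr & 0xFE000000) == 0xFA000000)
        return KindBlxImm;  // BLX <imm>: a genuinely unconditional op
    if ((instr >> 28) == 0xF)
        return KindNop;     // never-condition: treated as a NOP
    return normal;
}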
DTCM[0x4000]; u8 ICache[0x2000]; u32 ICacheTags[64*4]; @@ -274,7 +278,6 @@ public: // code/16N/32N/32S u8 MemTimings[0x100000][4]; - s32 RegionCodeCycles; u8* CurICacheLine; }; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8fd7708..561fabb 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -6,7 +6,11 @@ #include "Config.h" #include "ARMJIT_Internal.h" +#if defined(__x86_64__) #include "ARMJIT_x64/ARMJIT_Compiler.h" +#else +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#endif #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..0fe6a97 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -0,0 +1,837 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_RegShiftReg(int op, bool S, Op2& op2, ARM64Reg rs) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + UBFX(W1, rs, 0, 8); + + if (!S) + { + if (op == 3) + RORV(W0, op2.Reg.Rm, W1); + else + { + CMP(W1, 32); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GE); + ASRV(W0, op2.Reg.Rm, W1); + } + else + { + if (op == 0) + LSLV(W0, op2.Reg.Rm, W1); + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W0, WZR, W0, CC_GE); + } + } + } + else + { + MOV(W0, op2.Reg.Rm); + FixupBranch zero = CBZ(W1); + + SUB(W1, W1, 1); + if (op == 3) + { + RORV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + CMP(W1, 31); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GT); + ASRV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + if (op == 0) + { + LSLV(W0, op2.Reg.Rm, W1); + UBFX(W1, W0, 31, 1); + } + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W1, WZR, op ? W0 : W1, CC_GT); + BFI(RCPSR, W1, 29, 1); + CSEL(W0, WZR, W0, CC_GE); + } + } + + MOV(W0, W0, ArithOption(W0, (ShiftType)op, 1)); + SetJumpTarget(zero); + } + op2 = Op2(W0, ST_LSL, 0); +} + +void Compiler::Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, ARM64Reg tmp) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + switch (op) + { + case 0: // LSL + if (S && amount) + { + UBFX(tmp, op2.Reg.Rm, 32 - amount, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_LSL, amount); + return; + case 1: // LSR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + if (amount == 0) + { + op2 = Op2(0); + return; + } + op2 = Op2(op2.Reg.Rm, ST_LSR, amount); + return; + case 2: // ASR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ASR, amount ? 
amount : 31); + return; + case 3: // ROR + if (amount == 0) + { + UBFX(tmp, RCPSR, 29, 1); + LSL(tmp, tmp, 31); + if (S) + BFI(RCPSR, op2.Reg.Rm, 29, 1); + ORR(tmp, tmp, op2.Reg.Rm, ArithOption(tmp, ST_LSR, 1)); + + op2 = Op2(tmp, ST_LSL, 0); + } + else + { + if (S) + { + UBFX(tmp, op2.Reg.Rm, amount - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ROR, amount); + } + return; + } +} + +void Compiler::Comp_RetriveFlags(bool retriveCV) +{ + if (CurInstr.SetFlags) + CPSRDirty = true; + + if (CurInstr.SetFlags & 0x4) + { + CSET(W0, CC_EQ); + BFI(RCPSR, W0, 30, 1); + } + if (CurInstr.SetFlags & 0x8) + { + CSET(W0, CC_MI); + BFI(RCPSR, W0, 31, 1); + } + if (retriveCV) + { + if (CurInstr.SetFlags & 0x2) + { + CSET(W0, CC_CS); + BFI(RCPSR, W0, 29, 1); + } + if (CurInstr.SetFlags & 0x1) + { + CSET(W0, CC_VS); + BFI(RCPSR, W0, 28, 1); + } + } +} + +void Compiler::Comp_Logical(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (S && !CurInstr.SetFlags) + S = false; + + switch (op) + { + case 0x0: // AND + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, op2.Imm, W0); + else + ANDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, op2.Imm, W0); + else + AND(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x1: // EOR + if (op2.IsImm) + EORI2R(rd, rn, op2.Imm, W0); + else + EOR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xC: // ORR + if (op2.IsImm) + ORRI2R(rd, rn, op2.Imm, W0); + else + ORR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xE: // BIC + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, ~op2.Imm, W0); + else + BICS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, ~op2.Imm, W0); + else + BIC(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + if (S && !CurInstr.SetFlags) + S = false; + + bool CVInGP = false; + switch (op) + { + case 0x2: // SUB + if (S) + { + if (op2.IsImm) + SUBSI2R(rd, rn, op2.Imm, W0); + else + SUBS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + { + MOVI2R(W2, op2.Imm); + SUBI2R(rd, rn, op2.Imm, W0); + } + else + SUB(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x3: // RSB + if (op2.IsZero()) + { + op2 = Op2(WZR); + } + else if (op2.IsImm) + { + MOVI2R(W1, op2.Imm); + op2 = Op2(W1); + } + else if (op2.Reg.ShiftAmount != 0) + { + MOV(W1, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W1); + } + + if (S) + SUBS(rd, op2.Reg.Rm, rn); + else + SUB(rd, op2.Reg.Rm, rn); + break; + case 0x4: // ADD + if (S) + { + if (op2.IsImm) + ADDSI2R(rd, rn, op2.Imm, W0); + else + ADDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ADDI2R(rd, rn, op2.Imm, W0); + else + ADD(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x5: // ADC + UBFX(W2, RCPSR, 29, 1); + if (S) + { + CVInGP = true; + ADDS(W1, rn, W2); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm, W0); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, rn, W2); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm, W0); + else + ADD(rd, W1, op2.Reg.Rm, 
op2.ToArithOption()); + } + break; + case 0x6: // SBC + UBFX(W2, RCPSR, 29, 1); + // W1 = -op2 - 1 + if (op2.IsImm) + MOVI2R(W1, ~op2.Imm); + else + ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); + if (S) + { + CVInGP = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + ADDS(rd, rn, W1); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + ADD(rd, rn, W1); + } + break; + case 0x7: // RSC + UBFX(W2, RCPSR, 29, 1); + // W1 = -rn - 1 + MVN(W1, rn); + if (S) + { + CVInGP = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm); + else + ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + { + if (CVInGP) + { + BFI(RCPSR, W2, 29, 1); + BFI(RCPSR, W3, 28, 1); + } + Comp_RetriveFlags(!CVInGP); + } +} + +void Compiler::Comp_Compare(int op, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + switch (op) + { + case 0x8: // TST + if (op2.IsImm) + TSTI2R(rn, op2.Imm, W0); + else + ANDS(WZR, rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0x9: // TEQ + if (op2.IsImm) + EORI2R(W0, rn, op2.Imm, W0); + else + EOR(W0, rn, op2.Reg.Rm, op2.ToArithOption()); + TST(W0, W0); + break; + case 0xA: // CMP + if (op2.IsImm) + CMPI2R(rn, op2.Imm, W0); + else + CMP(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0xB: // CMN + if (op2.IsImm) + ADDSI2R(WZR, rn, op2.Imm, W0); + else + CMN(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + } + + Comp_RetriveFlags(op >= 0xA); +} + +// also counts cycles! 
+void Compiler::A_Comp_GetOp2(bool S, Op2& op2) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + op2 = Op2(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + op2.Reg.Rm = MapReg(CurInstr.A_Reg(0)); + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + if (CurInstr.A_Reg(0) == 15) + { + ADD(W0, op2.Reg.Rm, 4); + op2.Reg.Rm = W0; + } + Comp_RegShiftReg(op, S, op2, rs); + } + else + { + Comp_AddCycles_C(); + + int amount = (CurInstr.Instr >> 7) & 0x1F; + Comp_RegShiftImm(op, amount, S, op2); + } + } +} + +void Compiler::A_Comp_ALUCmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(op <= 0x9, op2); + + Comp_Compare(op, rn, op2); +} + +void Compiler::A_Comp_ALUMovOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + Op2 op2; + A_Comp_GetOp2(S, op2); + + if (op == 0xF) // MVN + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm); + MOVI2R(rd, ~op2.Imm); + } + else + ORN(rd, WZR, op2.Reg.Rm, op2.ToArithOption()); + } + else // MOV + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm); + MOVI2R(rd, op2.Imm); + } + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + + if (S) + { + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_ALUTriOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + bool logical = (1 << op) & 0xF303; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(S && logical, op2); + + if (op2.IsImm && op2.Imm == 0) + op2 = Op2(WZR, ST_LSL, 0); + + if (logical) + Comp_Logical(op, S, rd, rn, op2); + else + Comp_Arithmetic(op, S, rd, rn, op2); + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_Clz() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + + CLZ(rd, rm); + + assert(Num == 0); +} + +void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg rs, ARM64Reg rn) +{ + if (Num == 0) + { + Comp_AddCycles_CI(S ? 3 : 1); + } + else + { + CLZ(W0, rs); + CLS(W1, rs); + CMP(W0, W1); + CSEL(W0, W0, W1, CC_GT); + Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (mla) + MADD(rd, rm, rs, rn); + else + MUL(rd, rm, rs); + + if (S && FlagsNZNeeded()) + { + TST(rd, rd); + Comp_RetriveFlags(false); + } +} + +void Compiler::A_Comp_Mul_Long() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + + if (Num == 0) + { + Comp_AddCycles_CI(S ? 
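// The operand decode at the top of A_Comp_GetOp2() above: an ARM
// data-processing immediate is an 8-bit value rotated right by twice the
// 4-bit rotate field; (instr >> 7) & 0x1E already includes the doubling.
// Standalone version:
#include <cstdint>

inline uint32_t ArmExpandImm(uint32_t instr)
{
    uint32_t val = instr & 0xFF;
    unsigned rot = (instr >> 7) & 0x1E;  // 2 * rotate field, 0..30
    return rot ? ((val >> rot) | (val << (32 - rot))) : val;
}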
3 : 1); + } + else + { + CLZ(W0, rs); + CLS(W1, rs); + CMP(W0, W1); + CSEL(W0, W0, W1, CC_GT); + Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (add) + { + MOV(W0, rn); + BFI(X0, EncodeRegTo64(rd), 32, 32); + if (sign) + SMADDL(EncodeRegTo64(rn), rm, rs, X0); + else + UMADDL(EncodeRegTo64(rn), rm, rs, X0); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + else + { + if (sign) + SMULL(EncodeRegTo64(rn), rm, rs); + else + UMULL(EncodeRegTo64(rn), rm, rs); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::A_Comp_Mul() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + + bool S = CurInstr.Instr & (1 << 20); + bool mla = CurInstr.Instr & (1 << 21); + ARM64Reg rn = INVALID_REG; + if (mla) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_Mul_Mla(S, mla, rd, rm, rs, rn); +} + +void Compiler::T_Comp_ShiftImm() +{ + Comp_AddCycles_C(); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + Op2 op2; + op2.Reg.Rm = MapReg(CurInstr.T_Reg(3)); + Comp_RegShiftImm(op, amount, true, op2); + if (op2.IsImm) + MOVI2R(rd, op2.Imm); + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + + Comp_RetriveFlags(false); +} + +void Compiler::T_Comp_AddSub_() +{ + Comp_AddCycles_C(); + + Op2 op2; + if (CurInstr.Instr & (1 << 10)) + op2 = Op2((CurInstr.Instr >> 6) & 0x7); + else + op2 = Op2(MapReg(CurInstr.T_Reg(6))); + + Comp_Arithmetic( + CurInstr.Instr & (1 << 9) ? 0x2 : 0x4, + true, + MapReg(CurInstr.T_Reg(0)), + MapReg(CurInstr.T_Reg(3)), + op2); +} + +void Compiler::T_Comp_ALUImm8() +{ + Comp_AddCycles_C(); + + u32 imm = CurInstr.Instr & 0xFF; + int op = (CurInstr.Instr >> 11) & 0x3; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + + switch (op) + { + case 0: + MOVI2R(rd, imm); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + case 1: + Comp_Compare(0xA, rd, Op2(imm)); + break; + case 2: + case 3: + Comp_Arithmetic(op == 2 ? 0x4 : 0x2, true, rd, rd, Op2(imm)); + break; + } +} + +void Compiler::T_Comp_ALU() +{ + int op = (CurInstr.Instr >> 6) & 0xF; + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.T_Reg(3)); + + if ((op >= 0x2 && op <= 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + Comp_Logical(0x0, true, rd, rd, Op2(rs)); + break; + case 0x1: + Comp_Logical(0x1, true, rd, rd, Op2(rs)); + break; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + Op2 op2; + op2.Reg.Rm = rd; + Comp_RegShiftReg(op == 0x7 ? 
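// The CLZ/CLS pair emitted for ARM7 multiplies above estimates how many
// leading bits of Rs equal its sign, which is what the ARM7TDMI's early-out
// multiplier timing depends on. A64 CLS (count leading sign bits) has no
// direct C operator; a host-side model of both, assuming GCC/Clang
// builtins:
#include <cstdint>

inline unsigned Clz32(uint32_t x) { return x ? __builtin_clz(x) : 32; }

inline unsigned Cls32(uint32_t x)
{
    // bits differing from their left neighbour mark the first sign change
    uint32_t t = x ^ (uint32_t)((int32_t)x >> 1);
    return t ? __builtin_clz(t) - 1 : 31;
}
// the generated code then feeds max(Clz32(rs), Cls32(rs)) >> 3 into
// Comp_AddCycles_CI as the data-dependent part of the multiply's cycles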
3 : (op - 0x2), true, op2, rs); + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + break; + case 0x5: + Comp_Arithmetic(0x5, true, rd, rd, Op2(rs)); + break; + case 0x6: + Comp_Arithmetic(0x6, true, rd, rd, Op2(rs)); + break; + case 0x8: + Comp_Compare(0x8, rd, Op2(rs)); + break; + case 0x9: + Comp_Arithmetic(0x3, true, rd, rs, Op2(0)); + break; + case 0xA: + Comp_Compare(0xA, rd, Op2(rs)); + break; + case 0xB: + Comp_Compare(0xB, rd, Op2(rs)); + break; + case 0xC: + Comp_Logical(0xC, true, rd, rd, Op2(rs)); + break; + case 0xD: + Comp_Mul_Mla(true, false, rd, rd, rs, INVALID_REG); + break; + case 0xE: + Comp_Logical(0xE, true, rd, rd, Op2(rs)); + break; + case 0xF: + MVN(rd, rs); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0: + Comp_Arithmetic(0x4, false, rdMapped, rdMapped, Op2(rs)); + break; + case 1: + Comp_Compare(0xA, rdMapped, rs); + return; + case 2: + MOV(rdMapped, rs); + break; + } + + if (rd == 15) + { + Comp_JumpTo(rdMapped, false, false); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + ARM64Reg sp = MapReg(13); + u32 offset = (CurInstr.Instr & 0x7F) << 2; + if (CurInstr.Instr & (1 << 7)) + SUB(sp, sp, offset); + else + ADD(sp, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + ARM64Reg sp = MapReg(13); + ADD(rd, sp, offset); + } + else + MOVI2R(rd, (R15 & ~2) + offset); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..542f0b7 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -0,0 +1,452 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +// hack +const int kCodeCacheTiming = 3; + +namespace ARMJIT +{ + +template +void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) +{ + cpu->JumpTo(addr, changeCPSR); +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + ORRI2R(RCPSR, RCPSR, 0x20); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + ANDI2R(RCPSR, RCPSR, ~0x20); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOVI2R(W0, regionCodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true) >> 16; + cycles += cpu9->CodeCycles; + 
cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOVI2R(W0, codeRegion); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeRegion)); + MOVI2R(W0, codeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + { + MOVI2R(W0, newPC); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + } + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + + +void* Compiler::Gen_JumpTo9(int kind) +{ + AlignCode16(); + void* res = GetRXPtr(); + + MOVI2R(W2, kCodeCacheTiming); + // W1 - code cycles non branch + // W2 - branch code cycles + LSR(W1, W0, 12); + LSL(W1, W1, 2); + ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); + LDRB(W1, RCPU, W1); + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + CMP(W0, W3); + FixupBranch outsideITCM = B(CC_LO); + MOVI2R(W1, 1); + MOVI2R(W2, 1); + SetJumpTarget(outsideITCM); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + ADD(W1, W1, W2); + ADD(RCycles, RCycles, W1); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + FixupBranch halfwordLoc = TBZ(W0, 1); + ADD(W1, W1, W2); + ADD(RCycles, RCycles, W1); + RET(); + + SetJumpTarget(halfwordLoc); + ADD(RCycles, RCycles, W2); + RET(); + } + + return res; +} + +void* Compiler::Gen_JumpTo7(int kind) +{ + void* res = GetRXPtr(); + + LSR(W1, W0, 24); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeRegion)); + LSR(W1, W0, 15); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeCycles)); + + MOVP2R(X2, NDS::ARM7MemTimings); + LDR(W3, X2, ArithOption(W1, true)); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + UBFX(W2, W3, 0, 8); + UBFX(W3, W3, 8, 8); + ADD(W2, W3, W2); + ADD(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, 
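// The "cheato" codeCycles = addr >> 15 above works because ARM7 memory
// timings are looked up from a table indexed by address bits 15 and up, so
// the shifted address doubles as the timing key. A branch then costs one
// nonsequential plus one sequential fetch; the indices follow the
// [codeCycles][0..3] accesses visible above (16-bit fetches in entries 0/1,
// 32-bit in 2/3). Sketch, with the table's element type assumed:
#include <cstdint>

inline uint32_t Arm7BranchFetchCycles(const uint8_t timings[][4],
                                      uint32_t addr, bool thumb)
{
    const uint8_t* t = timings[addr >> 15];
    return thumb ? (uint32_t)t[0] + t[1]   // 16-bit nonseq + seq
                 : (uint32_t)t[2] + t[3];  // 32-bit nonseq + seq
}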
RCPU, offsetof(ARM, R[15])); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + UBFX(W2, W3, 16, 8); + UBFX(W3, W3, 24, 8); + ADD(W2, W3, W2); + ADD(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + + return res; +} + +void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR) +{ + IrregularCycles = true; + + if (!restoreCPSR) + { + if (switchThumb) + CPSRDirty = true; + MOV(W0, addr); + BL((Num ? JumpToFuncs7 : JumpToFuncs9)[switchThumb ? 0 : (Thumb + 1)]); + } + else + { + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + bool previouslyDirty = CPSRDirty; + SaveCPSR(); + + if (restoreCPSR) + { + if (Thumb || CurInstr.Cond() >= 0xE) + RegCache.Flush(); + else + { + // the ugly way... + // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + } + + if (switchThumb) + MOV(W1, addr); + else + { + if (Thumb) + ORRI2R(W1, addr, 1); + else + ANDI2R(W1, addr, ~1); + } + MOV(X0, RCPU); + MOVI2R(W2, restoreCPSR); + if (Num == 0) + QuickCallFunction(X3, jumpToTrampoline); + else + QuickCallFunction(X3, jumpToTrampoline); + + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + + if (previouslyDirty) + LoadCPSR(); + CPSRDirty = previouslyDirty; + } +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOVI2R(MapReg(14), R15 - 4); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + ARM64Reg rn = MapReg(CurInstr.A_Reg(0)); + MOV(W0, rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOVI2R(MapReg(14), R15 - 4); + Comp_JumpTo(W0, true); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_BranchSpecialBehaviour(); + + FixupBranch skipFailed = B(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } + + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(W0, MapReg(CurInstr.A_Reg(3))); + MOVI2R(MapReg(14), R15 - 1); + Comp_JumpTo(W0, true); + } + else + { + ARM64Reg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn, true); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOVI2R(MapReg(14), R15 + offset); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + ARM64Reg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + ADD(W0, lr, offset); + MOVI2R(lr, (R15 - 2) | 1); + Comp_JumpTo(W0, Num == 0 && !(CurInstr.Instr & (1 << 12))); +} + +void 
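// The two-halfword Thumb BL/BLX handled by T_Comp_BL_LONG_1/2 above (and
// folded into one step by T_Comp_BL_Merged just below): the first half
// carries the sign-extended upper 11 offset bits, the second half the lower
// 11. Standalone target computation, parameter names illustrative:
#include <cstdint>

inline uint32_t ThumbBlTarget(uint32_t firstHalfAddr,
                              uint16_t firstHalf, uint16_t secondHalf)
{
    uint32_t pc = firstHalfAddr + 4;  // pipelined PC of the first half
    int32_t upper = (int32_t)((uint32_t)(firstHalf & 0x7FF) << 21) >> 9;  // sign-extend, << 12
    return pc + upper + ((secondHalf & 0x7FF) << 1);
}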
Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOVI2R(MapReg(14), (R15 - 2) | 1); + + Comp_JumpTo(target); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..89d0029 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -0,0 +1,707 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" + +#include "../ARMJIT_Internal.h" + +#ifdef __SWITCH__ +#include "../switch/compat_switch.h" + +extern char __start__; +#endif + +#include + +using namespace Arm64Gen; + + +namespace ARMJIT +{ + +/* + + Recompiling classic ARM to ARMv8 code is at the same time + easier and trickier than compiling to a less related architecture + like x64. At one hand you can translate a lot of instructions directly. + But at the same time, there are a ton of exceptions, like for + example ADD and SUB can't have a RORed second operand on ARMv8. + */ + +template <> +const ARM64Reg RegisterCache::NativeRegAllocOrder[] = + {W19, W20, W21, W22, W23, W24, W25, W26}; +template <> +const int RegisterCache::NativeRegsAvailable = 8; + +const int JitMemSize = 16 * 1024 * 1024; + +void Compiler::MovePC() +{ + ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); +} + +Compiler::Compiler() +{ +#ifdef __SWITCH__ + JitRWBase = memalign(0x1000, JitMemSize); + + JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000; + JitRWStart = virtmemReserve(JitMemSize); + MemoryInfo info = {0}; + u32 pageInfo = {0}; + int i = 0; + while (JitRXStart != NULL) + { + svcQueryMemory(&info, &pageInfo, (u64)JitRXStart); + if (info.type != MemType_Unmapped) + JitRXStart = (void*)((u8*)info.addr - JitMemSize - 0x1000); + else + break; + if (i++ > 8) + { + printf("couldn't find unmapped place for jit memory\n"); + JitRXStart = NULL; + } + } + + assert(JitRXStart != NULL); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize, Perm_Rx)); + assert(succeded); + succeded = R_SUCCEEDED(svcMapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + + SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); + JitMemUseableSize = JitMemSize; + Reset(); +#endif + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 2; j++) + { + MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); + } + } + MemFunc7[0][0] = (void*)NDS::ARM7Read8; + MemFunc7[1][0] = (void*)NDS::ARM7Read16; + MemFunc7[2][0] = (void*)NDS::ARM7Read32; + MemFunc7[0][1] = (void*)NDS::ARM7Write8; + MemFunc7[1][1] = (void*)NDS::ARM7Write16; + MemFunc7[2][1] = (void*)NDS::ARM7Write32; + + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < 2; j++) + { + MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); + MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); + } + } + + for (int i = 0; i < 3; i++) + { + JumpToFuncs9[i] = Gen_JumpTo9(i); + JumpToFuncs7[i] = Gen_JumpTo7(i); + } + + /* + W0 - mode + W1 - reg num + W3 - in/out value of reg + */ + { + ReadBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + CMP(W0, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + 
FixupBranch notEverything = B(CC_LT); + CMP(W0, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W0, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W0, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W0, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + RET(); + SetJumpTarget(irq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + RET(); + SetJumpTarget(svc); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + RET(); + SetJumpTarget(abt); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + RET(); + SetJumpTarget(und); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + RET(); + } + { + WriteBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + CMP(W0, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W0, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W0, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W0, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W0, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + MOVI2R(W4, 0); + RET(); + + SetJumpTarget(fiq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(irq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(svc); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(abt); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(und); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + MOVI2R(W4, 1); + RET(); + } + + //FlushIcache(); + + JitMemUseableSize -= GetCodeOffset(); + SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); +} + +Compiler::~Compiler() +{ +#ifdef __SWITCH__ + if (JitRWStart != NULL) + { + bool succeded = R_SUCCEEDED(svcUnmapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + virtmemFree(JitRWStart, JitMemSize); + succeded = R_SUCCEEDED(svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + free(JitRWBase); + } +#endif +} + +void Compiler::LoadReg(int reg, ARM64Reg nativeReg) +{ + if (reg == 15) + MOVI2R(nativeReg, R15); + else + LDR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::SaveReg(int reg, ARM64Reg nativeReg) +{ + STR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + LDR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); +} + +void Compiler::SaveCPSR(bool markClean) +{ + if (CPSRDirty) + { + STR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); + CPSRDirty = CPSRDirty && !markClean; + } +} + +FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + LSR(W1, RCPSR, 28); + MOVI2R(W2, 1); + LSLV(W2, W2, W1); + ANDI2R(W2, W2, ARM::ConditionTable[cond], W3); + + return CBZ(W2); + } + else + { + u8 bit = (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))); + + if (cond & 1) + return TBNZ(RCPSR, bit); + else + return TBZ(RCPSR, bit); + } +} + +#define F(x) &Compiler::A_Comp_##x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), 
F(ALUTriOp), + // EOR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SUB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADD + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SBC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ORR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MOV + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // BIC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MVN + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // TST + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // TEQ + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMP + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMN + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // Mul + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL, + // ARMv5 exclusives + F(Clz), NULL, NULL, NULL, NULL, + + // STR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRB + F(MemWB), F(MemWB), 
F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
+    // LDR
+    F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
+    // LDRB
+    F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
+    // STRH
+    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
+    // LDRD
+    NULL, NULL, NULL, NULL,
+    // STRD
+    NULL, NULL, NULL, NULL,
+    // LDRH
+    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
+    // LDRSB
+    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
+    // LDRSH
+    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
+    // Swap
+    NULL, NULL,
+    // LDM, STM
+    F(LDM_STM), F(LDM_STM),
+    // Branch
+    F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg),
+    // Special
+    NULL, NULL, NULL, NULL, NULL, NULL, NULL
+};
+#undef F
+#define F(x) &Compiler::T_Comp_##x
+const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] =
+{
+    // Shift imm
+    F(ShiftImm), F(ShiftImm), F(ShiftImm),
+    // Add/sub tri operand
+    F(AddSub_), F(AddSub_), F(AddSub_), F(AddSub_),
+    // 8 bit imm
+    F(ALUImm8), F(ALUImm8), F(ALUImm8), F(ALUImm8),
+    // ALU
+    F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU),
+    F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU),
+    // ALU hi reg
+    F(ALU_HiReg), F(ALU_HiReg), F(ALU_HiReg),
+    // PC/SP relative ops
+    F(RelAddr), F(RelAddr), F(AddSP),
+    // LDR PC rel
+    F(LoadPCRel),
+    // LDR/STR reg offset
+    F(MemReg), F(MemReg), F(MemReg), F(MemReg),
+    // LDR/STR sign extended, half
+    F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), F(MemRegHalf),
+    // LDR/STR imm offset
+    F(MemImm), F(MemImm), F(MemImm), F(MemImm),
+    // LDR/STR half imm offset
+    F(MemImmHalf), F(MemImmHalf),
+    // LDR/STR sp rel
+    F(MemSPRel), F(MemSPRel),
+    // PUSH/POP
+    F(PUSH_POP), F(PUSH_POP),
+    // LDMIA, STMIA
+    F(LDMIA_STMIA), F(LDMIA_STMIA),
+    // Branch
+    F(BCOND), F(BranchXchangeReg), F(BranchXchangeReg), F(B), F(BL_LONG_1), F(BL_LONG_2),
+    // Unk, SVC
+    NULL, NULL,
+    F(BL_Merged)
+};
+
+bool Compiler::CanCompile(bool thumb, u16 kind)
+{
+    return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL;
+}
+
+void Compiler::Comp_BranchSpecialBehaviour()
+{
+    if (CurInstr.BranchFlags & branch_IdleBranch)
+    {
+        MOVI2R(W0, 1);
+        STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop));
+    }
+
+    if (CurInstr.BranchFlags & branch_FollowCondNotTaken)
+    {
+        SaveCPSR(false);
+        RegCache.PrepareExit();
+        ADD(W0, RCycles, ConstantCycles);
+        ABI_PopRegisters(SavedRegs);
+        RET();
+    }
+}
+
+JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount)
+{
+    if (JitMemUseableSize - GetCodeOffset() < 1024 * 16)
+    {
+        printf("JIT memory full, resetting...\n");
+        ResetBlockCache();
+    }
+
+    JitBlockEntry res = (JitBlockEntry)GetRXPtr();
+
+    Thumb = thumb;
+    Num = cpu->Num;
+    CurCPU = cpu;
+    ConstantCycles = 0;
+    RegCache = RegisterCache<Compiler, ARM64Reg>(this, instrs, instrsCount, true);
+
+    //printf("compiling block at %x\n", R15 - (Thumb ? 2 : 4));
+    const u32 ALL_CALLEE_SAVED = 0x7FF80000;
+
+    SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED));
+
+    //if (Num == 1)
+    {
+        ABI_PushRegisters(SavedRegs);
+
+        MOVP2R(RCPU, CurCPU);
+        MOVI2R(RCycles, 0);
+
+        LoadCPSR();
+    }
+
+    for (int i = 0; i < instrsCount; i++)
+    {
+        CurInstr = instrs[i];
+        R15 = CurInstr.Addr + (Thumb ? 4 : 8);
+        CodeRegion = R15 >> 24;
+
+        CompileFunc comp = Thumb
+            ?
T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + Exit = i == (instrsCount - 1) || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + //printf("%x instr %x regs: r%x w%x n%x flags: %x %x %x\n", R15, CurInstr.Instr, CurInstr.Info.SrcRegs, CurInstr.Info.DstRegs, CurInstr.Info.ReadFlags, CurInstr.Info.NotStrictlyNeeded, CurInstr.Info.WriteFlags, CurInstr.SetFlags); + + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOVI2R(W0, R15); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + if (comp == NULL) + { + MOVI2R(W0, CurInstr.Instr); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CurInstr)); + } + if (Num == 0) + { + MOVI2R(W0, (s32)CurInstr.CodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + } + } + + if (comp == NULL) + { + SaveCPSR(); + RegCache.Flush(); + } + else + RegCache.Prepare(Thumb, i); + + if (Thumb) + { + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(X0, RCPU); + QuickCallFunction(X1, ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + Comp_AddCycles_C(); + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretARM[CurInstr.Info.Kind]); + } + else + { + (this->*comp)(); + } + + Comp_BranchSpecialBehaviour(); + + if (cond < 0xE) + { + if (IrregularCycles) + { + FixupBranch skipNop = B(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } + + SetJumpTarget(skipNop); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + LoadCPSR(); + } + + RegCache.Flush(); + + //if (Num == 1) + { + SaveCPSR(); + + ADD(W0, RCycles, ConstantCycles); + + ABI_PopRegisters(SavedRegs); + } + //else + // ADD(RCycles, RCycles, ConstantCycles); + + RET(); + + FlushIcache(); + + //printf("finished\n"); + + return res; +} + +void Compiler::Reset() +{ + SetCodePtr(0); + + const u32 brk_0 = 0xD4200000; + + for (int i = 0; i < JitMemUseableSize / 4; i++) + *(((u32*)GetRWPtr()) + i) = brk_0; +} + +void Compiler::Comp_AddCycles_C(bool nonConst) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (!nonConst && !CurInstr.Info.Branches()) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 numI) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; + + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + c;
+
+    ADD(RCycles, RCycles, numI, shift);
+    if (Thumb || CurInstr.Cond() >= 0xE)
+        ConstantCycles += c;
+    else
+        ADD(RCycles, RCycles, cycles);
+}
+
+void Compiler::Comp_AddCycles_CDI()
+{
+    if (Num == 0)
+        Comp_AddCycles_CD();
+    else
+    {
+        IrregularCycles = true;
+
+        s32 cycles;
+
+        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
+        s32 numD = CurInstr.DataCycles;
+
+        if (CurInstr.DataRegion == 0x02) // mainRAM
+        {
+            if (CodeRegion == 0x02)
+                cycles = numC + numD;
+            else
+            {
+                numC++;
+                cycles = std::max(numC + numD - 3, std::max(numC, numD));
+            }
+        }
+        else if (CodeRegion == 0x02)
+        {
+            numD++;
+            cycles = std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else
+        {
+            cycles = numC + numD + 1;
+        }
+
+        if (!Thumb && CurInstr.Cond() < 0xE)
+            ADD(RCycles, RCycles, cycles);
+        else
+            ConstantCycles += cycles;
+    }
+}
+
+void Compiler::Comp_AddCycles_CD()
+{
+    u32 cycles = 0;
+    if (Num == 0)
+    {
+        s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles;
+        s32 numD = CurInstr.DataCycles;
+
+        //if (DataRegion != CodeRegion)
+            cycles = std::max(numC + numD - 6, std::max(numC, numD));
+
+        IrregularCycles = cycles != numC;
+    }
+    else
+    {
+        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
+        s32 numD = CurInstr.DataCycles;
+
+        if (CurInstr.DataRegion == 0x02)
+        {
+            if (CodeRegion == 0x02)
+                cycles += numC + numD;
+            else
+                cycles += std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else if (CodeRegion == 0x02)
+        {
+            cycles += std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else
+        {
+            cycles += numC + numD;
+        }
+
+        IrregularCycles = true;
+    }
+
+    if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles)
+        ADD(RCycles, RCycles, cycles);
+    else
+        ConstantCycles += cycles;
+}
+
+}
\ No newline at end of file
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h
new file mode 100644
index 0000000..7e13507
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.h
@@ -0,0 +1,234 @@
+#ifndef ARMJIT_COMPILER_H
+#define ARMJIT_COMPILER_H
+
+#include "../ARM.h"
+#include "../ARMJIT.h"
+
+#include "../dolphin/Arm64Emitter.h"
+
+#include "../ARMJIT_Internal.h"
+#include "../ARMJIT_RegisterCache.h"
+
+namespace ARMJIT
+{
+
+const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27;
+const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28;
+const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29;
+
+struct Op2
+{
+    Op2()
+    {}
+
+    Op2(Arm64Gen::ARM64Reg rm) : IsImm(false)
+    {
+        Reg.Rm = rm;
+        Reg.ShiftType = Arm64Gen::ST_LSL;
+        Reg.ShiftAmount = 0;
+    }
+
+    Op2(u32 imm) : IsImm(true), Imm(imm)
+    {}
+
+    Op2(Arm64Gen::ARM64Reg rm, Arm64Gen::ShiftType st, int amount) : IsImm(false)
+    {
+        Reg.Rm = rm;
+        Reg.ShiftType = st;
+        Reg.ShiftAmount = amount;
+    }
+
+    Arm64Gen::ArithOption ToArithOption()
+    {
+        assert(!IsImm);
+        return Arm64Gen::ArithOption(Reg.Rm, Reg.ShiftType, Reg.ShiftAmount);
+    }
+
+    bool IsSimpleReg()
+    { return !IsImm && !Reg.ShiftAmount && Reg.ShiftType == Arm64Gen::ST_LSL; }
+    bool ImmFits12Bit()
+    { return IsImm && ((Imm & 0xFFF) == Imm); }
+    bool IsZero()
+    { return IsImm && !Imm; }
+
+    bool IsImm;
+    union
+    {
+        struct
+        {
+            Arm64Gen::ARM64Reg Rm;
+            Arm64Gen::ShiftType ShiftType;
+            int ShiftAmount;
+        } Reg;
+        u32 Imm;
+    };
+};
+
+class Compiler : Arm64Gen::ARM64XEmitter
+{
+public:
+    typedef void (Compiler::*CompileFunc)();
+
+    Compiler();
+    ~Compiler();
+
+    Arm64Gen::ARM64Reg MapReg(int reg)
+    {
+        assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG);
+        return RegCache.Mapping[reg];
+    }
+
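+    // Compiles instrsCount already-decoded instructions into one native
+    // block and returns its entry point.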
+    JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount);
+
+    bool CanCompile(bool thumb, u16 kind);
+
+    bool FlagsNZNeeded()
+    {
+        return CurInstr.SetFlags & 0xC;
+    }
+
+    void Reset();
+
+    void Comp_AddCycles_C(bool forceNonConst = false);
+    void Comp_AddCycles_CI(u32 numI);
+    void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift);
+    void Comp_AddCycles_CD();
+    void Comp_AddCycles_CDI();
+
+    void MovePC();
+
+    void LoadReg(int reg, Arm64Gen::ARM64Reg nativeReg);
+    void SaveReg(int reg, Arm64Gen::ARM64Reg nativeReg);
+
+    void LoadCPSR();
+    void SaveCPSR(bool markClean = true);
+
+    void A_Comp_ALUTriOp();
+    void A_Comp_ALUMovOp();
+    void A_Comp_ALUCmpOp();
+
+    void A_Comp_Mul();
+    void A_Comp_Mul_Long();
+
+    void A_Comp_Clz();
+
+    void A_Comp_MemWB();
+    void A_Comp_MemHD();
+
+    void A_Comp_LDM_STM();
+
+    void A_Comp_BranchImm();
+    void A_Comp_BranchXchangeReg();
+
+
+    void T_Comp_ShiftImm();
+    void T_Comp_AddSub_();
+    void T_Comp_ALUImm8();
+    void T_Comp_ALU();
+    void T_Comp_ALU_HiReg();
+    void T_Comp_AddSP();
+    void T_Comp_RelAddr();
+
+    void T_Comp_MemReg();
+    void T_Comp_MemImm();
+    void T_Comp_MemRegHalf();
+    void T_Comp_MemImmHalf();
+    void T_Comp_LoadPCRel();
+    void T_Comp_MemSPRel();
+
+    void T_Comp_LDMIA_STMIA();
+    void T_Comp_PUSH_POP();
+
+    void T_Comp_BCOND();
+    void T_Comp_B();
+    void T_Comp_BranchXchangeReg();
+    void T_Comp_BL_LONG_1();
+    void T_Comp_BL_LONG_2();
+    void T_Comp_BL_Merged();
+
+    s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode);
+
+    void Comp_Mul_Mla(bool S, bool mla, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rm, Arm64Gen::ARM64Reg rs, Arm64Gen::ARM64Reg rn);
+
+    void Comp_Compare(int op, Arm64Gen::ARM64Reg rn, Op2 op2);
+    void Comp_Logical(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2);
+    void Comp_Arithmetic(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2);
+
+    void Comp_RetriveFlags(bool retriveCV);
+
+    Arm64Gen::FixupBranch CheckCondition(u32 cond);
+
+    void Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR = false);
+    void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false);
+
+    void A_Comp_GetOp2(bool S, Op2& op2);
+
+    void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0);
+    void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs);
+
+    void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr);
+    enum
+    {
+        memop_Writeback = 1 << 0,
+        memop_Post = 1 << 1,
+        memop_SignExtend = 1 << 2,
+        memop_Store = 1 << 3,
+        memop_SubtractOffset = 1 << 4
+    };
+    void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags);
+
+    void* Gen_MemoryRoutine9(int size, bool store);
+
+    void* Gen_MemoryRoutine9Seq(bool store, bool preinc);
+    void* Gen_MemoryRoutine7Seq(bool store, bool preinc);
+
+    // 0 = switch mode, 1 = stay arm, 2 = stay thumb
+    void* Gen_JumpTo9(int kind);
+    void* Gen_JumpTo7(int kind);
+
+    void Comp_BranchSpecialBehaviour();
+
+    bool Exit;
+
+    FetchedInstr CurInstr;
+    bool Thumb;
+    u32 R15;
+    u32 Num;
+    ARM* CurCPU;
+    u32 ConstantCycles;
+    u32 CodeRegion;
+
+    BitSet32 SavedRegs;
+
+    u32 JitMemUseableSize;
+
+    void* ReadBanked, *WriteBanked;
+
+    // [size][store]
+    void* MemFunc9[3][2];
+    void* MemFunc7[3][2];
+
+    // [store][pre increment]
+    void* MemFuncsSeq9[2][2];
+    // [store][pre increment]
+    void* MemFuncsSeq7[2][2];
+
+    void* JumpToFuncs9[3];
+    void* JumpToFuncs7[3];
+
+    RegisterCache<Compiler, ARM64Reg> RegCache;
+
+    bool CPSRDirty = false;
+
+    bool IrregularCycles
= false; + +#ifdef __SWITCH__ + void* JitRWBase; + void* JitRWStart; + void* JitRXStart; +#endif +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..a5d0e3f --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -0,0 +1,848 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +// W0 - address +// (if store) W1 - value to store +// W2 - code cycles +void* Compiler::Gen_MemoryRoutine9(int size, bool store) +{ + AlignCode16(); + void* res = GetRXPtr(); + + u32 addressMask; + switch (size) + { + case 32: addressMask = ~3; break; + case 16: addressMask = ~1; break; + case 8: addressMask = ~0; break; + } + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); + SUB(W3, W0, W3); + CMP(W3, W4); + FixupBranch insideDTCM = B(CC_LO); + + UBFX(W4, W0, 24, 8); + CMP(W4, 0x02); + FixupBranch outsideMainRAM = B(CC_NEQ); + ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); + MOVP2R(X4, NDS::MainRAM); + if (!store && size == 32) + { + LDR(W3, X3, X4); + ANDI2R(W0, W0, 3); + LSL(W0, W0, 3); + RORV(W0, W3, W0); + } + else if (store) + STRGeneric(size, W1, X3, X4); + else + LDRGeneric(size, false, W0, X3, X4); + RET(); + + SetJumpTarget(outsideMainRAM); + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + CMP(W0, W3); + FixupBranch insideITCM = B(CC_LO); + + if (store) + { + if (size > 8) + ANDI2R(W0, W0, addressMask); + + switch (size) + { + case 32: QuickTailCall(X4, NDS::ARM9Write32); break; + case 16: QuickTailCall(X4, NDS::ARM9Write16); break; + case 8: QuickTailCall(X4, NDS::ARM9Write8); break; + } + } + else + { + if (size == 32) + ABI_PushRegisters({0, 30}); + if (size > 8) + ANDI2R(W0, W0, addressMask); + + switch (size) + { + case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; + case 16: QuickTailCall (X4, NDS::ARM9Read16); break; + case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; + } + if (size == 32) + { + ABI_PopRegisters({1, 30}); + ANDI2R(W1, W1, 3); + LSL(W1, W1, 3); + RORV(W0, W0, W1); + RET(); + } + } + + SetJumpTarget(insideDTCM); + ANDI2R(W3, W3, 0x3FFF & addressMask); + ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); + if (!store && size == 32) + { + ANDI2R(W4, W0, 3); + LDR(W0, RCPU, W3); + LSL(W4, W4, 3); + RORV(W0, W0, W4); + } + else if (store) + STRGeneric(size, W1, RCPU, W3); + else + LDRGeneric(size, false, W0, RCPU, W3); + + RET(); + + SetJumpTarget(insideITCM); + ANDI2R(W3, W0, 0x7FFF & addressMask); + if (store) + { + LSR(W0, W3, 8); + ADDI2R(W0, W0, ExeMemRegionOffsets[exeMem_ITCM], W4); + MOVP2R(X4, CodeRanges); + ADD(X4, X4, X0, ArithOption(X0, ST_LSL, 4)); + static_assert(sizeof(AddressRange) == 16); + LDR(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); + FixupBranch null = CBZ(W4); + ABI_PushRegisters({1, 3, 30}); + QuickCallFunction(X4, InvalidateByAddr); + ABI_PopRegisters({1, 3, 30}); + SetJumpTarget(null); + } + ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); + if (!store && size == 32) + { + ANDI2R(W4, W0, 3); + LDR(W0, RCPU, W3); + LSL(W4, W4, 3); + RORV(W0, W0, W4); + } + else if (store) + STRGeneric(size, W1, RCPU, W3); + else + LDRGeneric(size, false, W0, RCPU, W3); + RET(); + + return res; +} + +/* + W0 - base address + X1 - stack space + W2 - values count +*/ +void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +{ + AlignCode16(); + void* res = GetRXPtr(); + + void* loopStart = 
GetRXPtr(); + SUB(W2, W2, 1); + + if (preinc) + ADD(W0, W0, 4); + + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); + LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); + SUB(W4, W0, W4); + CMP(W4, W5); + FixupBranch insideDTCM = B(CC_LO); + + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); + CMP(W0, W4); + FixupBranch insideITCM = B(CC_LO); + + ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once + if (store) + { + LDR(X1, X1, ArithOption(X2, true)); + QuickCallFunction(X4, NDS::ARM9Write32); + + ABI_PopRegisters({0, 1, 2, 30}); + } + else + { + QuickCallFunction(X4, NDS::ARM9Read32); + MOV(W4, W0); + + ABI_PopRegisters({0, 1, 2, 30}); + + STR(X4, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + SetJumpTarget(insideDTCM); + + ANDI2R(W4, W4, ~3 & 0x3FFF); + ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); + if (store) + { + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X4); + } + else + { + LDR(W5, RCPU, X4); + STR(X5, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + SetJumpTarget(insideITCM); + + ANDI2R(W4, W0, ~3 & 0x7FFF); + + if (store) + { + LSR(W6, W4, 8); + ADDI2R(W6, W6, ExeMemRegionOffsets[exeMem_ITCM], W5); + MOVP2R(X5, CodeRanges); + ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); + static_assert(sizeof(AddressRange) == 16); + LDR(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); + FixupBranch null = CBZ(W5); + ABI_PushRegisters({0, 1, 2, 4, 30}); + MOV(W0, W6); + QuickCallFunction(X5, InvalidateByAddr); + ABI_PopRegisters({0, 1, 2, 4, 30}); + SetJumpTarget(null); + } + + ADDI2R(W4, W4, offsetof(ARMv5, ITCM), W5); + if (store) + { + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X4); + } + else + { + LDR(W5, RCPU, X4); + STR(X5, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + return res; +} + +void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +{ + AlignCode16(); + void* res = GetRXPtr(); + + void* loopStart = GetRXPtr(); + SUB(W2, W2, 1); + + if (preinc) + ADD(W0, W0, 4); + + ABI_PushRegisters({0, 1, 2, 30}); + if (store) + { + LDR(X1, X1, ArithOption(X2, true)); + QuickCallFunction(X4, NDS::ARM7Write32); + ABI_PopRegisters({0, 1, 2, 30}); + } + else + { + QuickCallFunction(X4, NDS::ARM7Read32); + MOV(W4, W0); + ABI_PopRegisters({0, 1, 2, 30}); + STR(X4, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + return res; +} + +void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOVI2R(MapReg(rd), val); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); +} + +void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (flags & memop_Store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & 
(memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) + { + Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); + return; + } + } + + { + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memFunc = Num == 0 + ? MemFunc9[size >> 4][!!(flags & memop_Store)] + : MemFunc7[size >> 4][!!((flags & memop_Store))]; + + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + MOVP2R(X0, ptr); + if (flags & memop_Store) + STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); + else + { + LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); + if (size == 32 && addr & ~0x3) + ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); + } + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + ARM64Reg finalAddr = W0; + if (flags & memop_Post) + { + finalAddr = rnMapped; + MOV(W0, rnMapped); + } + + if (flags & memop_Store) + MOV(W1, rdMapped); + + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might become an immediate + if (offset.IsImm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) + { + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } + + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } + + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); + + if (inlinePreparation) + { + if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) + ANDI2R(rdMapped, W0, 3); + if (size > 8) + ANDI2R(W0, W0, addressMask); + } + QuickCallFunction(X2, memFunc); + if (!(flags & memop_Store)) + { + if (inlinePreparation && !(flags & memop_Store) && size == 32) + { + if (constLocalROR32 == 4) + { + LSL(rdMapped, rdMapped, 3); + RORV(rdMapped, W0, rdMapped); + } + else if (constLocalROR32 > 0) + ROR_(rdMapped, W0, constLocalROR32 << 3); + else + MOV(rdMapped, W0); + } + else if (flags & memop_SignExtend) + { + if (size == 16) + SXTH(rdMapped, W0); + else if (size == 8) + SXTB(rdMapped, W0); + else + assert("What's wrong with you?"); + } + else + MOV(rdMapped, W0); + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 
0, false); + } + } + } +} + +void Compiler::A_Comp_MemWB() +{ + Op2 offset; + if (CurInstr.Instr & (1 << 25)) + offset = Op2(MapReg(CurInstr.A_Reg(0)), (ShiftType)((CurInstr.Instr >> 5) & 0x3), (CurInstr.Instr >> 7) & 0x1F); + else + offset = Op2(CurInstr.Instr & 0xFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, byte ? 8 : 32, flags); +} + +void Compiler::A_Comp_MemHD() +{ + bool load = CurInstr.Instr & (1 << 20); + bool signExtend; + int op = (CurInstr.Instr >> 5) & 0x3; + int size; + + if (load) + { + signExtend = op >= 2; + size = op == 2 ? 8 : 16; + } + else + { + size = 16; + signExtend = false; + } + + Op2 offset; + if (CurInstr.Instr & (1 << 22)) + offset = Op2((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0)); + else + offset = Op2(MapReg(CurInstr.A_Reg(0))); + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), + Op2(MapReg(CurInstr.T_Reg(6))), byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(MapReg(CurInstr.T_Reg(6))), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + if (Config::JIT_LiteralOptimisations) + { + Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); + Comp_AddCycles_CDI(); + } + else + { + bool negative = addr < R15; + u32 abs = negative ? R15 - addr : addr - R15; + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); + } +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 
0 : memop_Store); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + IrregularCycles = true; + + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + Comp_AddCycles_CD(); + + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W0, RCPSR, 0, 5); + + int i = regsCount - 1; + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + if (RegCache.Mapping[reg] != INVALID_REG) + MOV(W3, MapReg(reg)); + else + LoadReg(reg, W3); + MOVI2R(W1, reg - 8); + BL(ReadBanked); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3; + ARM64Reg second = W4; + + if (RegCache.Mapping[reg] != INVALID_REG) + first = MapReg(reg); + else + LoadReg(reg, W3); + + if (RegCache.Mapping[*nextReg] != INVALID_REG) + second = MapReg(*nextReg); + else + LoadReg(*nextReg, W4); + + STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + + i--; + it++; + } + else if (RegCache.Mapping[reg] != INVALID_REG) + STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + else + { + LoadReg(reg, W3); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + i--; + it++; + } + } + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + preinc ^= true; + } + else + MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); + MOVI2R(W2, regsCount); + + BL(Num ? MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + + if (!store) + { + Comp_AddCycles_CDI(); + + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W0, RCPSR, 0, 5); + + int i = regsCount - 1; + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + MOVI2R(W1, reg - 8); + BL(WriteBanked); + FixupBranch alreadyWritten = CBNZ(W4); + if (RegCache.Mapping[reg] != INVALID_REG) + { + MOV(MapReg(reg), W3); + RegCache.DirtyRegs |= 1 << reg; + } + else + SaveReg(reg, W3); + SetJumpTarget(alreadyWritten); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.Mapping[reg] != INVALID_REG) + { + first = MapReg(reg); + if (reg != 15) + RegCache.DirtyRegs |= 1 << reg; + } + if (RegCache.Mapping[*nextReg] != INVALID_REG) + { + second = MapReg(*nextReg); + if (*nextReg != 15) + RegCache.DirtyRegs |= 1 << *nextReg; + } + + LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + + if (first == W3) + SaveReg(reg, W3); + if (second == W4) + SaveReg(*nextReg, W4); + + it++; + i--; + } + else if (RegCache.Mapping[reg] != INVALID_REG) + { + ARM64Reg mapped = MapReg(reg); + LDR(INDEX_UNSIGNED, mapped, SP, i * 8); + + if (reg != 15) + RegCache.DirtyRegs |= 1 << reg; + } + else + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + SaveReg(reg, W3); + } + + it++; + i--; + } + } + ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + + if (!store && regs[15]) + { + ARM64Reg mapped = MapReg(15); + Comp_JumpTo(mapped, Num == 0, usermode); + } + + return regsCount * 4 * (decrement ? 
-1 : 1); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + { + if (offset > 0) + ADD(rn, rn, offset); + else + SUB(rn, rn, -offset); + } +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + ARM64Reg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + if (offset > 0) + ADD(sp, sp, offset); + else + SUB(sp, sp, -offset); +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + ARM64Reg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + u32 regsCount = regs.Count(); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + { + if (offset > 0) + ADD(rb, rb, offset); + else + SUB(rb, rb, -offset); + } +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 08e2f0a..b884773 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + namespace ARMInstrInfo { @@ -363,7 +365,11 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SpecialKind = special_WriteMem; if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; res.SpecialKind = special_LoadLiteral; + } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { @@ -417,7 +423,6 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 cp = ((instr >> 8) & 0xF); if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) { - printf("happens\n"); data = A_UNK; res.Kind = ak_UNK; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 10428aa..8b81ce3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,10 +41,31 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp ) + + if (ARCHITECTURE STREQUAL x86_64) + target_sources(core PRIVATE + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp + ) + endif() + if (ARCHITECTURE STREQUAL ARM64) + target_sources(core PRIVATE + dolphin/Arm64Emitter.cpp + dolphin/MathUtil.cpp + + ARMJIT_A64/ARMJIT_Compiler.cpp + ARMJIT_A64/ARMJIT_ALU.cpp + ARMJIT_A64/ARMJIT_LoadStore.cpp + ARMJIT_A64/ARMJIT_Branch.cpp + ) + endif() endif() if (WIN32) diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h new file mode 100644 index 0000000..40c4576 --- /dev/null +++ b/src/dolphin/Align.h @@ -0,0 +1,24 @@ +// This file is under the public domain. 
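+// Usage example (illustrative): Common::AlignUp(0x123u, 0x10) == 0x130,
+// Common::AlignDown(0x123u, 0x10) == 0x120.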
+
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace Common
+{
+template <typename T>
+constexpr T AlignUp(T value, size_t size)
+{
+    static_assert(std::is_unsigned<T>(), "T must be an unsigned value.");
+    return static_cast<T>(value + (size - value % size) % size);
+}
+
+template <typename T>
+constexpr T AlignDown(T value, size_t size)
+{
+    static_assert(std::is_unsigned<T>(), "T must be an unsigned value.");
+    return static_cast<T>(value - value % size);
+}
+
+} // namespace Common
diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp
new file mode 100644
index 0000000..dbcf425
--- /dev/null
+++ b/src/dolphin/Arm64Emitter.cpp
@@ -0,0 +1,4466 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstring>
+#include <vector>
+
+#include "Align.h"
+#include "Arm64Emitter.h"
+#include "Assert.h"
+#include "BitUtils.h"
+#include "../types.h"
+#include "MathUtil.h"
+
+namespace Arm64Gen
+{
+namespace
+{
+const int kWRegSizeInBits = 32;
+const int kXRegSizeInBits = 64;
+
+// The below few functions are taken from V8.
+int CountLeadingZeros(uint64_t value, int width)
+{
+    // TODO(jbramley): Optimize this for ARM64 hosts.
+    int count = 0;
+    uint64_t bit_test = 1ULL << (width - 1);
+    while ((count < width) && ((bit_test & value) == 0))
+    {
+        count++;
+        bit_test >>= 1;
+    }
+    return count;
+}
+
+uint64_t LargestPowerOf2Divisor(uint64_t value)
+{
+    return value & -(int64_t)value;
+}
+
+// For ADD/SUB
+bool IsImmArithmetic(uint64_t input, u32* val, bool* shift)
+{
+    if (input < 4096)
+    {
+        *val = input;
+        *shift = false;
+        return true;
+    }
+    else if ((input & 0xFFF000) == input)
+    {
+        *val = input >> 12;
+        *shift = true;
+        return true;
+    }
+    return false;
+}
+
+// For AND/TST/ORR/EOR etc
+bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s,
+                  unsigned int* imm_r)
+{
+    // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL));
+    // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits));
+
+    bool negate = false;
+
+    // Logical immediates are encoded using parameters n, imm_s and imm_r using
+    // the following table:
+    //
+    //    N   imms    immr    size        S             R
+    //    1  ssssss  rrrrrr    64    UInt(ssssss)  UInt(rrrrrr)
+    //    0  0sssss  xrrrrr    32    UInt(sssss)   UInt(rrrrr)
+    //    0  10ssss  xxrrrr    16    UInt(ssss)    UInt(rrrr)
+    //    0  110sss  xxxrrr     8    UInt(sss)     UInt(rrr)
+    //    0  1110ss  xxxxrr     4    UInt(ss)      UInt(rr)
+    //    0  11110s  xxxxxr     2    UInt(s)       UInt(r)
+    // (s bits must not be all set)
+    //
+    // A pattern is constructed of size bits, where the least significant S+1 bits
+    // are set. The pattern is rotated right by R, and repeated across a 32 or
+    // 64-bit value, depending on destination register width.
+    //
+    // Put another way: the basic format of a logical immediate is a single
+    // contiguous stretch of 1 bits, repeated across the whole word at intervals
+    // given by a power of 2. To identify them quickly, we first locate the
+    // lowest stretch of 1 bits, then the next 1 bit above that; that combination
+    // is different for every logical immediate, so it gives us all the
+    // information we need to identify the only logical immediate that our input
+    // could be, and then we simply check if that's the value we actually have.
+    //
+    // (The rotation parameter does give the possibility of the stretch of 1 bits
+    // going 'round the end' of the word. To deal with that, we observe that in
+    // any situation where that happens the bitwise NOT of the value is also a
+    // valid logical immediate.
So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) + { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) + { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) + { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } + else + { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) + { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } + else + { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. 
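+            // Worked example: 0x0000FFFF00000000 is a single 16-bit stretch of
+            // ones, so d = 64, s = 16, r = 32, giving N = 1, imms = 0b001111,
+            // immr = 32.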
+            clz_a = CountLeadingZeros(a, kXRegSizeInBits);
+            d = 64;
+            mask = ~UINT64_C(0);
+            out_n = 1;
+        }
+    }
+
+    // If the repeat period d is not a power of two, it can't be encoded.
+    if (!MathUtil::IsPow2(d))
+        return false;
+
+    // If the bit stretch (b - a) does not fit within the mask derived from the
+    // repeat period, then fail.
+    if (((b - a) & ~mask) != 0)
+        return false;
+
+    // The only possible option is b - a repeated every d bits. Now we're going to
+    // actually construct the valid logical immediate derived from that
+    // specification, and see if it equals our original input.
+    //
+    // To repeat a value every d bits, we multiply it by a number of the form
+    // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
+    // be derived using a table lookup on CLZ(d).
+    static const std::array<uint64_t, 6> multipliers = {{
+        0x0000000000000001UL,
+        0x0000000100000001UL,
+        0x0001000100010001UL,
+        0x0101010101010101UL,
+        0x1111111111111111UL,
+        0x5555555555555555UL,
+    }};
+
+    int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57;
+
+    // Ensure that the index to the multipliers array is within bounds.
+    DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast<size_t>(multiplier_idx) < multipliers.size()));
+
+    uint64_t multiplier = multipliers[multiplier_idx];
+    uint64_t candidate = (b - a) * multiplier;
+
+    // The candidate pattern doesn't match our input value, so fail.
+    if (value != candidate)
+        return false;
+
+    // We have a match! This is a valid logical immediate, so now we have to
+    // construct the bits and pieces of the instruction encoding that generates
+    // it.
+
+    // Count the set bits in our basic stretch. The special case of clz(0) == -1
+    // makes the answer come out right for stretches that reach the very top of
+    // the word (e.g. numbers like 0xffffc00000000000).
+    int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits);
+    int s = clz_a - clz_b;
+
+    // Decide how many bits to rotate right by, to put the low bit of that basic
+    // stretch in position a.
+    int r;
+    if (negate)
+    {
+        // If we inverted the input right at the start of this function, here's
+        // where we compensate: the number of set bits becomes the number of clear
+        // bits, and the rotation count is based on position b rather than position
+        // a (since b is the location of the 'lowest' 1 bit after inversion).
+        s = d - s;
+        r = (clz_b + 1) & (d - 1);
+    }
+    else
+    {
+        r = (clz_a + 1) & (d - 1);
+    }
+
+    // Now we're done, except for having to encode the S output in such a way that
+    // it gives both the number of set bits and the length of the repeated
+    // segment. The s field is encoded like this:
+    //
+    //     imms    size        S
+    //    ssssss    64    UInt(ssssss)
+    //    0sssss    32    UInt(sssss)
+    //    10ssss    16    UInt(ssss)
+    //    110sss     8    UInt(sss)
+    //    1110ss     4    UInt(ss)
+    //    11110s     2    UInt(s)
+    //
+    // So we 'or' (-d << 1) with our computed s to form imms.
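+    // Worked example: the 32-bit value 0x00FF00FF (8 ones repeated every 16
+    // bits; its low bit is set, so it is inverted first) comes out as d = 16,
+    // s = 8, r = 0, giving N = 0, imms = 0b100111, immr = 0.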
+    *n = out_n;
+    *imm_s = ((-d << 1) | (s - 1)) & 0x3f;
+    *imm_r = r;
+
+    return true;
+}
+
+float FPImm8ToFloat(u8 bits)
+{
+    const u32 sign = bits >> 7;
+    const u32 bit6 = (bits >> 6) & 1;
+    const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3);
+    const u32 mantissa = (bits & 0xF) << 19;
+    const u32 f = (sign << 31) | (exp << 23) | mantissa;
+
+    return Common::BitCast<float>(f);
+}
+
+bool FPImm8FromFloat(float value, u8* imm_out)
+{
+    const u32 f = Common::BitCast<u32>(value);
+    const u32 mantissa4 = (f & 0x7FFFFF) >> 19;
+    const u32 exponent = (f >> 23) & 0xFF;
+    const u32 sign = f >> 31;
+
+    if ((exponent >> 7) == ((exponent >> 6) & 1))
+        return false;
+
+    const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4;
+    const float new_float = FPImm8ToFloat(imm8);
+    if (new_float == value)
+        *imm_out = imm8;
+    else
+        return false;
+
+    return true;
+}
+} // Anonymous namespace
+
+void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr)
+{
+    m_code = ptr;
+}
+
+void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr)
+{
+    SetCodePtrUnsafe(ptr);
+    m_lastCacheFlushEnd = ptr;
+}
+
+void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase)
+{
+    m_code = 0;
+    m_lastCacheFlushEnd = 0;
+    m_rwbase = rwbase;
+    m_rxbase = rxbase;
+}
+
+ptrdiff_t ARM64XEmitter::GetCodeOffset()
+{
+    return m_code;
+}
+
+const u8* ARM64XEmitter::GetRWPtr()
+{
+    return m_rwbase + m_code;
+}
+
+u8* ARM64XEmitter::GetWriteableRWPtr()
+{
+    return m_rwbase + m_code;
+}
+
+void* ARM64XEmitter::GetRXPtr()
+{
+    return m_rxbase + m_code;
+}
+
+void ARM64XEmitter::ReserveCodeSpace(u32 bytes)
+{
+    for (u32 i = 0; i < bytes / 4; i++)
+        BRK(0);
+}
+
+ptrdiff_t ARM64XEmitter::AlignCode16()
+{
+    int c = int((u64)m_code & 15);
+    if (c)
+        ReserveCodeSpace(16 - c);
+    return m_code;
+}
+
+ptrdiff_t ARM64XEmitter::AlignCodePage()
+{
+    int c = int((u64)m_code & 4095);
+    if (c)
+        ReserveCodeSpace(4096 - c);
+    return m_code;
+}
+
+void ARM64XEmitter::Write32(u32 value)
+{
+    std::memcpy(m_rwbase + m_code, &value, sizeof(u32));
+    m_code += sizeof(u32);
+}
+
+void ARM64XEmitter::FlushIcache()
+{
+    FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code);
+    m_lastCacheFlushEnd = m_code;
+}
+
+void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end)
+{
+    if (start == end)
+        return;
+
+#if defined(IOS)
+    // Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
+    sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
+#else
+    // Don't rely on GCC's __clear_cache implementation, as it caches
+    // icache/dcache cache line sizes, that can vary between cores on
+    // big.LITTLE architectures.
+    u64 addr, ctr_el0;
+    static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
+    size_t isize, dsize;
+
+    __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
+    isize = 4 << ((ctr_el0 >> 0) & 0xf);
+    dsize = 4 << ((ctr_el0 >> 16) & 0xf);
+
+    // use the global minimum cache line size
+    icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
+    dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
+
+    addr = (u64)start & ~(u64)(dsize - 1);
+    for (; addr < (u64)end; addr += dsize)
+        // use "civac" instead of "cvau", as this is the suggested workaround for
+        // Cortex-A53 errata 819472, 826319, 827319 and 824069.
+ __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = (u64)start & ~(u64)(isize - 1); + for (; addr < (u64)end; addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt); +} + +void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 
bits, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + (((u32)distance << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) +{ + s64 distance = (s64)ptr - s64(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) +{ + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", + __func__, imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) +{ + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? 
(1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) +{ + Rs = DecodeReg(Rs); + Rt2 = 
DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) | + (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | + (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, + int n) +{ + // Sometimes Rd is fixed to SP, but can still be 32bit 
or 64bit.
+  // Use Rn to determine bitness here.
+  bool b64Bit = Is64Bit(Rn);
+
+  Rd = DecodeReg(Rd);
+  Rn = DecodeReg(Rn);
+
+  Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) |
+          (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
+                                        ARM64Reg Rn, s32 imm)
+{
+  bool b64Bit = Is64Bit(Rt);
+  u32 type_encode = 0;
+
+  switch (type)
+  {
+  case INDEX_SIGNED:
+    type_encode = 0b010;
+    break;
+  case INDEX_POST:
+    type_encode = 0b001;
+    break;
+  case INDEX_PRE:
+    type_encode = 0b011;
+    break;
+  case INDEX_UNSIGNED:
+    ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__);
+    break;
+  }
+
+  if (b64Bit)
+  {
+    op |= 0b10;
+    imm >>= 3;
+  }
+  else
+  {
+    imm >>= 2;
+  }
+
+  Rt = DecodeReg(Rt);
+  Rt2 = DecodeReg(Rt2);
+  Rn = DecodeReg(Rn);
+
+  Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) |
+          (Rt2 << 10) | (Rn << 5) | Rt);
+}
+void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm)
+{
+  Rd = DecodeReg(Rd);
+
+  Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__,
+             imm);
+  Rt = DecodeReg(Rt);
+  Rn = DecodeReg(Rn);
+
+  Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
+}
+
+static constexpr bool IsInRangeImm19(s64 distance)
+{
+  return (distance >= -0x40000 && distance <= 0x3FFFF);
+}
+
+static constexpr bool IsInRangeImm14(s64 distance)
+{
+  return (distance >= -0x2000 && distance <= 0x1FFF);
+}
+
+static constexpr bool IsInRangeImm26(s64 distance)
+{
+  return (distance >= -0x2000000 && distance <= 0x1FFFFFF);
+}
+
+static constexpr u32 MaskImm19(s64 distance)
+{
+  return distance & 0x7FFFF;
+}
+
+static constexpr u32 MaskImm14(s64 distance)
+{
+  return distance & 0x3FFF;
+}
+
+static constexpr u32 MaskImm26(s64 distance)
+{
+  return distance & 0x3FFFFFF;
+}
+
+// FixupBranch branching
+void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch)
+{
+  bool Not = false;
+  u32 inst = 0;
+  s64 distance = (s64)(m_code - branch.ptr);
+  distance >>= 2;
+
+  switch (branch.type)
+  {
+  case 1:  // CBNZ
+    Not = true;
+    // fall through
+  case 0:  // CBZ
+  {
+    ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    bool b64Bit = Is64Bit(branch.reg);
+    ARM64Reg reg = DecodeReg(branch.reg);
+    inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg;
+  }
+  break;
+  case 2:  // B (conditional)
+    ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond;
+    break;
+  case 4:  // TBNZ
+    Not = true;
+    // fall through
+  case 3:  // TBZ
+  {
+    ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    ARM64Reg reg = DecodeReg(branch.reg);
+    inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) |
+           (MaskImm14(distance) << 5) | reg;
+  }
+  break;
+  case 5:  // B (unconditional)
+    ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    inst = (0x5 << 26) | MaskImm26(distance);
+ break; + case 6: // BL (unconditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + + std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) +{ + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr, + distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) +{ + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) +{ + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BLR(scratchreg); + } + else + { + BL(func); + } +} + +void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! 
Using scratch.", m_code,
+    // func);
+    MOVI2R(scratchreg, (uintptr_t)func);
+    BR(scratchreg);
+  }
+  else
+  {
+    B(func);
+  }
+}
+
+// Unconditional Branch (register)
+void ARM64XEmitter::BR(ARM64Reg Rn)
+{
+  EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::BLR(ARM64Reg Rn)
+{
+  EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::RET(ARM64Reg Rn)
+{
+  EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::ERET()
+{
+  EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP);
+}
+void ARM64XEmitter::DRPS()
+{
+  EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP);
+}
+
+// Exception generation
+void ARM64XEmitter::SVC(u32 imm)
+{
+  EncodeExceptionInst(0, imm);
+}
+
+void ARM64XEmitter::HVC(u32 imm)
+{
+  EncodeExceptionInst(1, imm);
+}
+
+void ARM64XEmitter::SMC(u32 imm)
+{
+  EncodeExceptionInst(2, imm);
+}
+
+void ARM64XEmitter::BRK(u32 imm)
+{
+  EncodeExceptionInst(3, imm);
+}
+
+void ARM64XEmitter::HLT(u32 imm)
+{
+  EncodeExceptionInst(4, imm);
+}
+
+void ARM64XEmitter::DCPS1(u32 imm)
+{
+  EncodeExceptionInst(5, imm);
+}
+
+void ARM64XEmitter::DCPS2(u32 imm)
+{
+  EncodeExceptionInst(6, imm);
+}
+
+void ARM64XEmitter::DCPS3(u32 imm)
+{
+  EncodeExceptionInst(7, imm);
+}
+
+// System
+void ARM64XEmitter::_MSR(PStateField field, u8 imm)
+{
+  u32 op1 = 0, op2 = 0;
+  switch (field)
+  {
+  case FIELD_SPSel:
+    op1 = 0;
+    op2 = 5;
+    break;
+  case FIELD_DAIFSet:
+    op1 = 3;
+    op2 = 6;
+    break;
+  case FIELD_DAIFClr:
+    op1 = 3;
+    op2 = 7;
+    break;
+  default:
+    ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do an immediate move to");
+    break;
+  }
+  EncodeSystemInst(0, op1, 4, imm, op2, WSP);
+}
+
+static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2)
+{
+  switch (field)
+  {
+  case FIELD_NZCV:
+    o0 = 3;
+    op1 = 3;
+    CRn = 4;
+    CRm = 2;
+    op2 = 0;
+    break;
+  case FIELD_FPCR:
+    o0 = 3;
+    op1 = 3;
+    CRn = 4;
+    CRm = 4;
+    op2 = 0;
+    break;
+  case FIELD_FPSR:
+    o0 = 3;
+    op1 = 3;
+    CRn = 4;
+    CRm = 4;
+    op2 = 1;
+    break;
+  case FIELD_PMCR_EL0:
+    o0 = 3;
+    op1 = 3;
+    CRn = 9;
+    CRm = 6;
+    op2 = 0;
+    break;
+  case FIELD_PMCCNTR_EL0:
+    o0 = 3;
+    op1 = 3;
+    CRn = 9;
+    CRm = 7;
+    op2 = 0;
+    break;
+  default:
+    ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to");
+    break;
+  }
+}
+
+void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt)
+{
+  int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0;
+  ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit");
+  GetSystemReg(field, o0, op1, CRn, CRm, op2);
+  EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field)
+{
+  int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0;
+  ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit");
+  GetSystemReg(field, o0, op1, CRn, CRm, op2);
+  EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt)
+{
+  ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit");
+
+  // MRS <Xt>, CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt
+  EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::HINT(SystemHint op)
+{
+  EncodeSystemInst(0, 3, 2, 0, op, WSP);
+}
+void ARM64XEmitter::CLREX()
+{
+  EncodeSystemInst(0, 3, 3, 0, 2, WSP);
+}
+void ARM64XEmitter::DSB(BarrierType type)
+{
+  EncodeSystemInst(0, 3, 3, type, 4, WSP);
+}
+void ARM64XEmitter::DMB(BarrierType type)
+{
+  EncodeSystemInst(0, 3, 3, type, 5, WSP);
+}
+void ARM64XEmitter::ISB(BarrierType type)
+{
+  EncodeSystemInst(0, 3, 3, type, 6, WSP);
+}
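+
+// Usage sketch for the arithmetic aliases below (assumes the CCFlags names
+// from the accompanying header, e.g. CC_NEQ):
+//
+//   CMP(W0, W1);                  // encoded as SUBS WZR, W0, W1
+//   CSINC(W2, WZR, WZR, CC_NEQ);  // i.e. CSET: W2 = (W0 == W1) ? 1 : 0
+//
+// CMP/CMN are simply the flag-setting SUBS/ADDS with the zero register as
+// destination, which is exactly how they are implemented here.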
+
+// Add/Subtract (extended register)
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm)
+{
+  CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm)
+{
+  CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+// Add/Subtract (with carry)
+void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm);
+}
+
+// Conditional Compare (immediate)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond);
+}
+
+// Conditional Compare (register)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond);
+}
+
+// Conditional Select
+void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(0, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(1, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(2, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(3, Rd, Rn, Rm, cond);
+}
+
+// Data-Processing 1 source
+void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EncodeData1SrcInst(0, Rd, Rn);
+}
+void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EncodeData1SrcInst(1, Rd, Rn);
+}
+void 
ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(2, Rd, Rn); +} +void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void 
ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) +{ + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) +{ + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? 
SP : WSP);
+}
+
+// Data Processing (Immediate)
+void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+  EncodeMOVWideInst(2, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+  EncodeMOVWideInst(0, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+  EncodeMOVWideInst(3, Rd, imm, pos);
+}
+
+// Bitfield move
+void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+  EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+  EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+  EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms);
+}
+
+void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+  u32 size = Is64Bit(Rn) ? 64 : 32;
+  ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+             "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+             lsb, width);
+  EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+  u32 size = Is64Bit(Rn) ? 64 : 32;
+  ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+             "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+             lsb, width);
+  EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift)
+{
+  bool sf = Is64Bit(Rd);
+  bool N = sf;
+  Rd = DecodeReg(Rd);
+  Rn = DecodeReg(Rn);
+  Rm = DecodeReg(Rm);
+  // Rn occupies bits 5-9 of the encoding, Rm bits 16-20.
+  Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rn << 5) | Rd);
+}
+void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+  SBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+  SBFM(Rd, Rn, 0, 15);
+}
+void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn)
+{
+  ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__);
+  SBFM(Rd, Rn, 0, 31);
+}
+void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+  UBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+  UBFM(Rd, Rn, 0, 15);
+}
+
+// Load Register (Literal)
+void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm)
+{
+  EncodeLoadRegisterInst(0, Rt, imm);
+}
+void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm)
+{
+  EncodeLoadRegisterInst(2, Rt, imm);
+}
+void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm)
+{
+  EncodeLoadRegisterInst(3, Rt, imm);
+}
+
+// Load/Store pair
+void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm);
+}
+
+// Load/Store Exclusive
+void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(4, SP, SP, Rt, 
Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 
1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 
2 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must be passed a 32bit (W) destination register!",
+             __func__);
+  EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm);
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+  switch (size | signExtend)
+  {
+  case 32: LDR  (Rt, Rn, Rm); break;
+  case 33: LDRSW(Rt, Rn, Rm); break;
+  case 16: LDRH (Rt, Rn, Rm); break;
+  case 17: LDRSH(Rt, Rn, Rm); break;
+  case 8:  LDRB (Rt, Rn, Rm); break;
+  case 9:  LDRSB(Rt, Rn, Rm); break;
+  default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break;
+  }
+}
+void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+  switch (size)
+  {
+  case 32: STR  (Rt, Rn, Rm); break;
+  case 16: STRH (Rt, Rn, Rm); break;
+  case 8:  STRB (Rt, Rn, Rm); break;
+  default: PanicAlert("STRGeneric(reg): invalid size %d", size); break;
+  }
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  switch (size | signExtend)
+  {
+  case 32: LDR  (type, Rt, Rn, imm); break;
+  case 33: LDRSW(type, Rt, Rn, imm); break;
+  case 16: LDRH (type, Rt, Rn, imm); break;
+  case 17: LDRSH(type, Rt, Rn, imm); break;
+  case 8:  LDRB (type, Rt, Rn, imm); break;
+  case 9:  LDRSB(type, Rt, Rn, imm); break;
+  default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break;
+  }
+}
+void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  switch (size)
+  {
+  case 32: STR  (type, Rt, Rn, imm); break;
+  case 16: STRH (type, Rt, Rn, imm); break;
+  case 8:  STRB (type, Rt, Rn, imm); break;
+  default: PanicAlert("STRGeneric(imm): invalid size %d", size); break;
+  }
+}
+
+// Address of label/page PC-relative
+void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm)
+{
+  EncodeAddressInst(0, Rd, imm);
+}
+void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm)
+{
+  EncodeAddressInst(1, Rd, imm >> 12);
+}
+
+// Wrapper around MOVZ+MOVK (and later MOVN)
+void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
+{
+  unsigned int parts = Is64Bit(Rd) ? 4 : 2;
+  BitSet32 upload_part(0);
+
+  // Always start with a movz! Kills the dependency on the register.
+  bool use_movz = true;
+
+  if (!imm)
+  {
+    // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
+    // clearer in disasm too.
+    MOVZ(Rd, 0, SHIFT_0);
+    return;
+  }
+
+  if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
+      (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
+  {
+    // Max unsigned value (or if signed, -1)
+    // Set to ~ZR
+    ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
+    ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0));
+    return;
+  }
+
+  // TODO: Make some more systemic use of MOVN, but this will take care of most cases.
+  // Small negative integer. Use MOVN
+  if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
+  {
+    MOVN(Rd, ~imm, SHIFT_0);
+    return;
+  }
+
+  // XXX: Use MOVN when possible.
+  // XXX: Optimize more
+  // XXX: Support rotating immediates to save instructions
+  if (optimize)
+  {
+    for (unsigned int i = 0; i < parts; ++i)
+    {
+      if ((imm >> (i * 16)) & 0xFFFF)
+        upload_part[i] = 1;
+    }
+  }
+
+  u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF;
+  s64 aligned_offset = (s64)imm - (s64)aligned_pc;
+  // The offset for ADR/ADRP is an s32, so make sure it can be represented in that
+  if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
+  {
+    // Immediate we are loading is within 4GB of our aligned range
+    // Most likely an address that we can load in one or two instructions
+    if (!(std::abs(aligned_offset) & 0xFFF))
+    {
+      // Aligned ADR
+      ADRP(Rd, (s32)aligned_offset);
+      return;
+    }
+    else
+    {
+      // If the address is within 1MB of PC we can load it in a single instruction still
+      s64 offset = (s64)imm - (s64)(m_rxbase + m_code);
+      if (offset >= -0xFFFFF && offset <= 0xFFFFF)
+      {
+        ADR(Rd, (s32)offset);
+        return;
+      }
+      else
+      {
+        ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
+        ADD(Rd, Rd, imm & 0xFFF);
+        return;
+      }
+    }
+  }
+
+  for (unsigned i = 0; i < parts; ++i)
+  {
+    if (use_movz && upload_part[i])
+    {
+      MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+      use_movz = false;
+    }
+    else
+    {
+      if (upload_part[i] || !optimize)
+        MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+    }
+  }
+}
+
+bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)
+{
+  // TODO: Also optimize for performance, not just for code size.
+  ptrdiff_t start_offset = GetCodeOffset();
+
+  MOVI2R(Rd, imm1);
+  int size1 = GetCodeOffset() - start_offset;
+
+  SetCodePtrUnsafe(start_offset);
+
+  MOVI2R(Rd, imm2);
+  int size2 = GetCodeOffset() - start_offset;
+
+  SetCodePtrUnsafe(start_offset);
+
+  bool element = size1 > size2;
+
+  MOVI2R(Rd, element ? imm2 : imm1);
+
+  return element;
+}
+
+void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
+{
+  int num_regs = registers.Count();
+  int stack_size = (num_regs + (num_regs & 1)) * 8;
+  auto it = registers.begin();
+
+  if (!num_regs)
+    return;
+
+  // 8 bytes per register, but 16 byte alignment, so we may have to pad one register.
+  // Only update the SP on the last write to avoid the dependency between those stores.
+
+  // The first push must adjust the SP, else a context switch may invalidate everything below SP.
+  if (num_regs & 1)
+  {
+    STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size);
+  }
+  else
+  {
+    ARM64Reg first_reg = (ARM64Reg)(X0 + *it++);
+    ARM64Reg second_reg = (ARM64Reg)(X0 + *it++);
+    STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size);
+  }
+
+  // Fast store for all other registers, this is always an even number.
+  for (int i = 0; i < (num_regs - 1) / 2; i++)
+  {
+    ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
+    ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
+    STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
+  }
+
+  ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
+}
+
+void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
+{
+  int num_regs = registers.Count();
+  int stack_size = (num_regs + (num_regs & 1)) * 8;
+  auto it = registers.begin();
+
+  if (!num_regs)
+    return;
+
+  // We must adjust the SP in the end, so load the first (two) registers at least.
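+  // For example, popping {X0, X1, X2} (an odd count, stack_size = 32) emits:
+  //   LDP X1, X2, [SP, #16]
+  //   LDR X0, [SP], #32   ; post-indexed, so the last load restores SP
+  // mirroring the pre-indexed STR/STP sequence in ABI_PushRegisters above.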
+  ARM64Reg first = (ARM64Reg)(X0 + *it++);
+  ARM64Reg second;
+  if (!(num_regs & 1))
+    second = (ARM64Reg)(X0 + *it++);
+
+  // 8 bytes per register, but 16 byte alignment, so we may have to pad one register.
+  // Only update the SP on the last load to avoid the dependency between those loads.
+
+  // Fast load for all but the first (two) registers, this is always an even number.
+  for (int i = 0; i < (num_regs - 1) / 2; i++)
+  {
+    ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
+    ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
+    LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
+  }
+
+  // Post loading the first (two) registers.
+  if (num_regs & 1)
+    LDR(INDEX_POST, first, SP, stack_size);
+  else
+    LDP(INDEX_POST, first, second, SP, stack_size);
+
+  ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
+}
+
+// Float Emitter
+void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt,
+                                               ARM64Reg Rn, s32 imm)
+{
+  Rt = DecodeReg(Rt);
+  Rn = DecodeReg(Rn);
+  u32 encoded_size = 0;
+  u32 encoded_imm = 0;
+
+  if (size == 8)
+    encoded_size = 0;
+  else if (size == 16)
+    encoded_size = 1;
+  else if (size == 32)
+    encoded_size = 2;
+  else if (size == 64)
+    encoded_size = 3;
+  else if (size == 128)
+    encoded_size = 0;
+
+  if (type == INDEX_UNSIGNED)
+  {
+    ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)),
+               "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__,
+               imm, m_emit->GetCodePtr());
+    ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!",
+               __func__);
+    if (size == 16)
+      imm >>= 1;
+    else if (size == 32)
+      imm >>= 2;
+    else if (size == 64)
+      imm >>= 3;
+    else if (size == 128)
+      imm >>= 4;
+    encoded_imm = (imm & 0xFFF);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255),
+               "%s immediate offset must be within range of -256 to 255!", __func__);
+    encoded_imm = (imm & 0x1FF) << 2;
+    if (type == INDEX_POST)
+      encoded_imm |= 1;
+    else
+      encoded_imm |= 3;
+  }
+
+  Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) |
+          (size == 128 ? 
(1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) | + (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) +{ + DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) + { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 
1 : 0); + int rmode = 0; + switch (round) + { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } + else + { + // Use the encoding (vector, single) that keeps the result in the fp register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) + { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) | + (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = !IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! 
Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) | + Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + u32 type_encode = 0; + u32 opc = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) + { + ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } + else if (size == 64) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + imm >>= 3; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 
26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) +{ + union + { + u8 hex; + struct + { + unsigned defgh : 5; + unsigned abc : 3; + }; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 1; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 1; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 1; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 1; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = 
EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + 
opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) +{ + if (IsScalar(Rd) && IsScalar(Rn)) + { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } + else + { + ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) + { + // GPR to scalar single + opcode |= 1; + } + else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) + { + // Scalar single to GPR - defaults are correct + } + else + { + // TODO + ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset 
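+// Usage sketch (illustrative, assuming an ARM64FloatEmitter named fp): a scaled +// register-offset float load/store looks like +//   fp.LDR(32, S0, X0, ArithOption(X1, true)); // S0 = mem[X0 + (X1 << 2)] +//   fp.STR(32, S0, X0, ArithOption(X1, true)); // mem[X0 + (X1 << 2)] = S0 +// ArithOption(reg, true) selects the extended-register form and scales the index by +// the access size (see the ArithOption constructor in Arm64Emitter.h).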
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) +{ + int type = isDouble ? 
1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) | + (Rn << 5) | Rd); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) +{ + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void 
ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) +{ + u32 imm5 = 0, imm4 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64), + "%s must have a size of 64 when destination is 64bit!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 
0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, 
ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must be less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must be less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + 
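// immh's top set bit selects the element width; its low bits, together with immb, + // hold the shift amount (see the size branches below). Usage sketch (illustrative): + //   fp.SHRN(16, D0, Q1, 4); // narrow to 16-bit lanes with a right shift of 4 + 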
ASSERT_MSG(DYNA_REC, shift < dest_size, "%s shift amount must be less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) + { + immh = 1; + } + else if (dest_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (dest_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__); + } + else if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else // 64 + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) + { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", + __func__); + // XXX: Implement support for MOVI - shifting ones 
variant + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else + { + ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__); + } + EncodeModImm(Q, op, cmode, 0, Rd, imm); +} + +void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + int num_regs = registers.Count(); + m_emit->SUB(SP, SP, num_regs * 16); + m_emit->ADD(tmp, SP, 0); + std::vector<ARM64Reg> island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + + // 0 = true + // 1 < 4 && registers[i + 1] true! + // 2 < 4 && registers[i + 2] true! + // 3 < 4 && registers[i + 3] true! + // 4 < 4 && registers[i + 4] false! + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp); + + i += count - 1; + } + + // Handle island registers + std::vector<ARM64Reg> pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_POST, pair_regs[0], tmp, 16); + } + else + { + std::vector<ARM64Reg> pair_regs; + for (auto it : registers) + { + pair_regs.push_back((ARM64Reg)(Q0 + it)); + if (pair_regs.size() == 2) + { + STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_PRE, pair_regs[0], SP, -16); + } +} +void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + int num_regs = registers.Count(); + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + // The temporary register is only used to indicate that we can use this code path + std::vector<ARM64Reg> island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP); + + i += count - 1; + } + + // Handle island registers + std::vector<ARM64Reg> pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + LDR(128, INDEX_POST, pair_regs[0], SP, 16); + } + else + { + bool odd = num_regs % 2; + std::vector<ARM64Reg> pair_regs; + for (int i = 31; i >= 0; --i) + { + if (!registers[i]) + continue; + + if (odd) + { + // First load must be a regular LDR if odd + odd = false; + LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16); + } + else + { + pair_regs.push_back((ARM64Reg)(Q0 + i)); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32); + pair_regs.clear(); + } + } + } + } +} + +void 
ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) +{ + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, imm, shift); + break; + case 1: + ADDS(Rd, Rn, imm, shift); + break; + case 2: + SUB(Rd, Rn, imm, shift); + break; + case 3: + SUBS(Rd, Rn, imm, shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) +{ + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) + { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) + { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but in place. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done when setting flags. 
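+ // Worked example (illustrative): imm = 0x123456 is emitted as + //   ADD Rd, Rn, #0x456          ; low 12 bits + //   ADD Rd, Rd, #0x123, LSL #12 ; high 12 bits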
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) + { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) + { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(DYNA_REC, has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch", + (u32)imm); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) +{ + ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) + { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm instruction, like + // 1.0... 
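+ // (For reference: the FMOV immediate form encodes +/-(16..31)/16 * 2^n for + // n in -3..4, e.g. 0.5, 1.0, 2.0, 31.0; nonzero representable values are + // handled by the FPImm8FromFloat() branch below.)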
+ } + else if (FPImm8FromFloat(value, &imm8)) + { + FMOV(Rd, imm8); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Common::BitCast<u32>(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) +{ + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solutions for many values + ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Arm64Gen diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h new file mode 100644 index 0000000..4cb9ff7 --- /dev/null +++ b/src/dolphin/Arm64Emitter.h @@ -0,0 +1,1152 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#include "ArmCommon.h" +#include "Assert.h" +#include "BitSet.h" +#include "Compat.h" + +namespace Arm64Gen +{ +// X30 serves a dual purpose as a link register +// Encoded as a 3-bit type tag (bits 7:5, listed below) plus a 5-bit register index (bits 4:0) +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg +{ + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) +{ + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) +{ + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool IsQuad(ARM64Reg reg) +{ + return 
(reg & 0xC0) == 0xC0; +} +constexpr bool IsVector(ARM64Reg reg) +{ + return (reg & 0xC0) != 0; +} +constexpr bool IsGPR(ARM64Reg reg) +{ + return static_cast<int>(reg) < 0x40; +} + +constexpr ARM64Reg DecodeReg(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg & 0x1F); +} +constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg | 0x20); +} +constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(DecodeReg(reg) + S0); +} +constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg) +{ + return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80); +} +constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg | 0xC0); +} + +enum OpType +{ + TYPE_IMM = 0, + TYPE_REG, + TYPE_IMMSREG, + TYPE_RSR, + TYPE_MEM +}; + +enum ShiftType +{ + ST_LSL = 0, + ST_LSR = 1, + ST_ASR = 2, + ST_ROR = 3, +}; + +enum IndexType +{ + INDEX_UNSIGNED, + INDEX_POST, + INDEX_PRE, + INDEX_SIGNED, // used in LDP/STP +}; + +enum ShiftAmount +{ + SHIFT_0 = 0, + SHIFT_16 = 1, + SHIFT_32 = 2, + SHIFT_48 = 3, +}; + +enum RoundingMode +{ + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero +}; + +struct FixupBranch +{ + ptrdiff_t ptr; + // Type defines + // 0 = CBZ (32bit) + // 1 = CBNZ (32bit) + // 2 = B (conditional) + // 3 = TBZ + // 4 = TBNZ + // 5 = B (unconditional) + // 6 = BL (unconditional) + u32 type; + + // Used with B.cond + CCFlags cond; + + // Used with TBZ/TBNZ + u8 bit; + + // Used with Test/Compare and Branch + ARM64Reg reg; +}; + +enum PStateField +{ + FIELD_SPSel = 0, + FIELD_DAIFSet, + FIELD_DAIFClr, + FIELD_NZCV, // The only system registers accessible from EL0 (user space) + FIELD_PMCR_EL0, + FIELD_PMCCNTR_EL0, + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, +}; + +enum SystemHint +{ + HINT_NOP = 0, + HINT_YIELD, + HINT_WFE, + HINT_WFI, + HINT_SEV, + HINT_SEVL, +}; + +enum BarrierType +{ + OSHLD = 1, + OSHST = 2, + OSH = 3, + NSHLD = 5, + NSHST = 6, + NSH = 7, + ISHLD = 9, + ISHST = 10, + ISH = 11, + LD = 13, + ST = 14, + SY = 15, +}; + +class ArithOption +{ +public: + enum WidthSpecifier + { + WIDTH_DEFAULT, + WIDTH_32BIT, + WIDTH_64BIT, + }; + + enum ExtendSpecifier + { + EXTEND_UXTB = 0x0, + EXTEND_UXTH = 0x1, + EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */ + EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */ + EXTEND_SXTB = 0x4, + EXTEND_SXTH = 0x5, + EXTEND_SXTW = 0x6, + EXTEND_SXTX = 0x7, + }; + + enum TypeSpecifier + { + TYPE_EXTENDEDREG, + TYPE_IMM, + TYPE_SHIFTEDREG, + }; + +private: + ARM64Reg m_destReg; + WidthSpecifier m_width; + ExtendSpecifier m_extend; + TypeSpecifier m_type; + ShiftType m_shifttype; + u32 m_shift; + +public: + ArithOption(ARM64Reg Rd, bool index = false) + { + // Indexed registers are a certain feature of AArch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + + m_destReg = Rd; + m_type = TYPE_EXTENDEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + m_extend = EXTEND_UXTX; + } + else + { + m_width = WIDTH_32BIT; + m_extend = EXTEND_UXTW; + } + m_shifttype = ST_LSL; + } + ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) + { + m_destReg = Rd; + m_shift = 
shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } + else + { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { return m_type; } + ARM64Reg GetReg() const { return m_destReg; } + u32 GetData() const + { + switch (m_type) + { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter +{ + friend class ARM64FloatEmitter; + +private: + ptrdiff_t m_code; + ptrdiff_t m_lastCacheFlushEnd; + u8* m_rwbase; + u8* m_rxbase; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + // TODO: make this less ugly + // used for the Switch, where memory is executable and writeable at different addresses; + // we need to take this into account for relative addressing + + void Write32(u32 value); + +public: + ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), 
m_rwbase(nullptr), m_rxbase(nullptr) {} + ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset) + { + m_rwbase = rwbase; + m_rxbase = rxbase; + m_code = offset; + m_lastCacheFlushEnd = offset; + } + + virtual ~ARM64XEmitter() {} + void SetCodePtr(ptrdiff_t ptr); + void SetCodePtrUnsafe(ptrdiff_t ptr); + void SetCodeBase(u8* rwbase, u8* rxbase); + void ReserveCodeSpace(u32 bytes); + ptrdiff_t AlignCode16(); + ptrdiff_t AlignCodePage(); + ptrdiff_t GetCodeOffset(); + const u8* GetRWPtr(); + u8* GetWriteableRWPtr(); + void* GetRXPtr(); + void FlushIcache(); + void FlushIcacheSection(u8* start, u8* end); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + 
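// CSET materializes a condition as 0/1 and CSETM as 0/all-ones; both are encoded + // as CSINC/CSINV against the zero register with the inverted condition. + // Usage sketch (illustrative): emit.CMP(W0, W1); emit.CSET(W2, CC_EQ); // W2 = (W0 == W1) ? 1 : 0 + 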
void CSET(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, 
ST_LSL, 0)); } + // Convenience wrappers around ORR. These match the official convenience syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); } + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); } + // Load Register (Literal) + void LDR(ARM64Reg Rt, u32 imm); + void LDRSW(ARM64Reg Rt, u32 imm); + void PRFM(ARM64Reg Rt, u32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void 
STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+  void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+
+  // Load/Store register (immediate indexed)
+  void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+  // Load/Store register (register offset)
+  void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+  // Load/Store register (unscaled offset)
+  void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+  // Load/Store pair
+  void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+  void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+  void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+  void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+  void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+  void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+  // Address of label/page PC-relative
+  void ADR(ARM64Reg Rd, s32 imm);
+  void ADRP(ARM64Reg Rd, s32 imm);
+
+  // Wrapper around MOVZ+MOVK
+  void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
+  bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
+  template <class P>
+  void MOVP2R(ARM64Reg Rd, P* ptr)
+  {
+    ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
+    MOVI2R(Rd, (uintptr_t)ptr);
+  }
+
+  // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch
+  // register.
+  void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG)
+  {
+    ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch);
+  }
+  void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+  void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
+                       ARM64Reg scratch);
+  void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+  void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+  bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+  bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+  bool TryCMPI2R(ARM64Reg Rn, u32 imm);
+
+  bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+  bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+  bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+
+  // ABI related
+  void ABI_PushRegisters(BitSet32 registers);
+  void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+
+  // Utility to generate a call to a std::function object.
+  //
+  // Unfortunately, calling operator() directly is undefined behavior in C++
+  // (this method might be a thunk in the case of multi-inheritance) so we
+  // have to go through a trampoline function.
+  template <typename T, typename... Args>
+  static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
+  {
+    return (*f)(args...);
+  }
+
+  // This function expects you to have set up the state.
+  // Overwrites X0 and X30
+  template <typename T, typename... Args>
+  ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f)
+  {
+    auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>;
+    MOVI2R(X30, (uintptr_t)trampoline);
+    MOVI2R(X0, (uintptr_t) const_cast<void*>((const void*)f));
+    return X30;
+  }
+
+  void QuickTailCall(ARM64Reg scratchreg, const void* func);
+  template <typename T>
+  void QuickTailCall(ARM64Reg scratchreg, T func)
+  {
+    QuickTailCall(scratchreg, (const void*)func);
+  }
+
+  // Plain function call
+  void QuickCallFunction(ARM64Reg scratchreg, const void* func);
+  template <typename T>
+  void QuickCallFunction(ARM64Reg scratchreg, T func)
+  {
+    QuickCallFunction(scratchreg, (const void*)func);
+  }
+};
+
+class ARM64FloatEmitter
+{
+public:
+  ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}
+  void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+  // Loadstore unscaled
+  void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+  void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+  // Loadstore single structure
+  void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+  void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+  void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+  void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+  void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+  void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+  void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+  void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+
+  // Loadstore multiple structure
+  void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+  void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+  void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+  void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+
+  // Loadstore paired
+  void LDP(u8 size, IndexType type, ARM64Reg Rt,
ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void 
INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that goes to an integer + // register + // and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + inline void Write32(u32 value) { m_emit->Write32(value); } + // Emitting functions + void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 
opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, + ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +} \ No newline at end of file diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h new file mode 100644 index 0000000..6d82e9d --- /dev/null +++ b/src/dolphin/ArmCommon.h @@ -0,0 +1,27 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+
+#include "../types.h"
+
+enum CCFlags
+{
+  CC_EQ = 0, // Equal
+  CC_NEQ,    // Not equal
+  CC_CS,     // Carry Set
+  CC_CC,     // Carry Clear
+  CC_MI,     // Minus (Negative)
+  CC_PL,     // Plus
+  CC_VS,     // Overflow
+  CC_VC,     // No Overflow
+  CC_HI,     // Unsigned higher
+  CC_LS,     // Unsigned lower or same
+  CC_GE,     // Signed greater than or equal
+  CC_LT,     // Signed less than
+  CC_GT,     // Signed greater than
+  CC_LE,     // Signed less than or equal
+  CC_AL,     // Always (unconditional) 14
+  CC_HS = CC_CS, // Alias of CC_CS  Unsigned higher or same
+  CC_LO = CC_CC, // Alias of CC_CC  Unsigned lower
+};
+const u32 NO_COND = 0xE0000000;
diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h
new file mode 100644
index 0000000..8b64a92
--- /dev/null
+++ b/src/dolphin/BitUtils.h
@@ -0,0 +1,254 @@
+// Copyright 2017 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <type_traits>
+
+namespace Common
+{
+///
+/// Retrieves the size of a type in bits.
+///
+/// @tparam T Type to get the size of.
+///
+/// @return the size of the type in bits.
+///
+template <typename T>
+constexpr size_t BitSize() noexcept
+{
+  return sizeof(T) * CHAR_BIT;
+}
+
+///
+/// Extracts a bit from a value.
+///
+/// @param  src The value to extract a bit from.
+/// @param  bit The bit to extract.
+///
+/// @tparam T   The type of the value.
+///
+/// @return The extracted bit.
+///
+template <typename T>
+constexpr T ExtractBit(const T src, const size_t bit) noexcept
+{
+  return (src >> bit) & static_cast<T>(1);
+}
+
+///
+/// Extracts a bit from a value.
+///
+/// @param  src The value to extract a bit from.
+///
+/// @tparam bit The bit to extract.
+/// @tparam T   The type of the value.
+///
+/// @return The extracted bit.
+///
+template <size_t bit, typename T>
+constexpr T ExtractBit(const T src) noexcept
+{
+  static_assert(bit < BitSize<T>(), "Specified bit must be within T's bit width.");
+
+  return ExtractBit(src, bit);
+}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param  src    The value to extract the bits from.
+/// @param  begin  The beginning of the bit range. This is inclusive.
+/// @param  end    The ending of the bit range. This is inclusive.
+///
+/// @tparam T      The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+///                of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept
+{
+  return static_cast<Result>(((static_cast<Result>(src) << ((BitSize<T>() - 1) - end)) >>
+                              (BitSize<T>() - end + begin - 1)));
+}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param  src    The value to extract the bits from.
+///
+/// @tparam begin  The beginning of the bit range. This is inclusive.
+/// @tparam end    The ending of the bit range. This is inclusive.
+/// @tparam T      The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+///                of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <size_t begin, size_t end, typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src) noexcept
+{
+  static_assert(begin < end, "Beginning bit must be less than the ending bit.");
+  static_assert(begin < BitSize<T>(), "Beginning bit is larger than T's bit width.");
+  static_assert(end < BitSize<T>(), "Ending bit is larger than T's bit width.");
+
+  return ExtractBits(src, begin, end);
+}
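To make the ExtractBits convention concrete, here is a small self-contained sketch (illustrative only, not part of the Dolphin sources; it assumes this header is reachable as "BitUtils.h"). The begin/end template arguments are inclusive bit indices, so decoding the register fields of an A64 ADD (shifted register) instruction word reads like this:

    #include <cstdint>
    #include "BitUtils.h"

    // 0x8B020020 encodes ADD X0, X1, X2: Rd = bits 0..4, Rn = bits 5..9, Rm = bits 16..20.
    constexpr uint32_t kInstr = 0x8B020020;
    static_assert(Common::ExtractBits<0, 4>(kInstr) == 0, "Rd is X0");
    static_assert(Common::ExtractBits<5, 9>(kInstr) == 1, "Rn is X1");
    static_assert(Common::ExtractBits<16, 20>(kInstr) == 2, "Rm is X2");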
+
+///
+/// Rotates a value left (ROL).
+///
+/// @param  value  The value to rotate.
+/// @param  amount The number of bits to rotate the value.
+/// @tparam T      An unsigned type.
+///
+/// @return The rotated value.
+///
+template <typename T>
+constexpr T RotateLeft(const T value, size_t amount) noexcept
+{
+  static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types left.");
+
+  amount %= BitSize<T>();
+
+  if (amount == 0)
+    return value;
+
+  return static_cast<T>((value << amount) | (value >> (BitSize<T>() - amount)));
+}
+
+///
+/// Rotates a value right (ROR).
+///
+/// @param  value  The value to rotate.
+/// @param  amount The number of bits to rotate the value.
+/// @tparam T      An unsigned type.
+///
+/// @return The rotated value.
+///
+template <typename T>
+constexpr T RotateRight(const T value, size_t amount) noexcept
+{
+  static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types right.");
+
+  amount %= BitSize<T>();
+
+  if (amount == 0)
+    return value;
+
+  return static_cast<T>((value >> amount) | (value << (BitSize<T>() - amount)));
+}
+
+///
+/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11.
+/// Both edge cases of all zeros and all ones are considered valid masks, too.
+///
+/// @param  mask The mask value to test for validity.
+///
+/// @tparam T    The type of the value.
+///
+/// @return A bool indicating whether the mask is valid.
+///
+template <typename T>
+constexpr bool IsValidLowMask(const T mask) noexcept
+{
+  static_assert(std::is_integral<T>::value, "Mask must be an integral type.");
+  static_assert(std::is_unsigned<T>::value, "Signed masks can introduce hard to find bugs.");
+
+  // Can be efficiently determined without looping or bit counting. It's the counterpart
+  // to https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
+  // and doesn't require special casing either edge case.
+  return (mask & (mask + 1)) == 0;
+}
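A few spot checks make IsValidLowMask's contract clearer (a hypothetical usage sketch under the same include assumption as above): it accepts exactly the values of the form 2^n - 1, including the two edge cases called out in the comment, and RotateRight is the natural companion when a contiguous mask has been rotated out of the low bits:

    #include <cstdint>
    #include "BitUtils.h"

    static_assert(Common::IsValidLowMask(0x0000FFFFu), "low 16 bits set");
    static_assert(Common::IsValidLowMask(0xFFFFFFFFu), "all ones counts");
    static_assert(!Common::IsValidLowMask(0x00FF00FFu), "hole in the middle");
    // A low mask rotated right by 4 wraps around into the top bits:
    static_assert(Common::RotateRight(0x000000FFu, 4) == 0xF000000Fu, "ROR by 4");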
+
+///
+/// Reinterpret objects of one type as another by bit-casting between object representations.
+///
+/// @remark This is the example implementation of std::bit_cast which is to be included
+///         in C++2a. See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html
+///         for more details. The only difference is this variant is not constexpr,
+///         as the mechanism for bit_cast requires a compiler built-in to have that quality.
+///
+/// @param source The source object to convert to another representation.
+///
+/// @tparam To   The type to reinterpret source as.
+/// @tparam From The initial type representation of source.
+///
+/// @return The representation of type From as type To.
+///
+/// @pre Both To and From types must be the same size
+/// @pre Both To and From types must satisfy the TriviallyCopyable concept.
+///
+template <typename To, typename From>
+inline To BitCast(const From& source) noexcept
+{
+  static_assert(sizeof(From) == sizeof(To),
+                "BitCast source and destination types must be equal in size.");
+  static_assert(std::is_trivially_copyable<From>(),
+                "BitCast source type must be trivially copyable.");
+  static_assert(std::is_trivially_copyable<To>(),
+                "BitCast destination type must be trivially copyable.");
+
+  std::aligned_storage_t<sizeof(To), alignof(To)> storage;
+  std::memcpy(&storage, &source, sizeof(storage));
+  return reinterpret_cast<To&>(storage);
+}
+
+template <typename T, typename PtrType>
+class BitCastPtrType
+{
+public:
+  static_assert(std::is_trivially_copyable<PtrType>(),
+                "BitCastPtr source type must be trivially copyable.");
+  static_assert(std::is_trivially_copyable<T>(),
+                "BitCastPtr destination type must be trivially copyable.");
+
+  explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {}
+
+  // Enable operator= only for pointers to non-const data
+  template <typename S>
+  inline typename std::enable_if<std::is_trivially_copyable<S>() && !std::is_const<PtrType>()>::type
+  operator=(const S& source)
+  {
+    std::memcpy(m_ptr, &source, sizeof(source));
+  }
+
+  inline operator T() const
+  {
+    T result;
+    std::memcpy(&result, m_ptr, sizeof(result));
+    return result;
+  }
+
+private:
+  PtrType* m_ptr;
+};
+
+// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs
+// Conversion constructor and operator= provided for a convenient syntax.
+// Usage: MyStruct s = BitCastPtr<MyStruct>(some_ptr);
+// BitCastPtr<MyStruct>(some_ptr) = s;
+template <typename T, typename PtrType>
+inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType<T, PtrType>
+{
+  return BitCastPtrType<T, PtrType>{ptr};
+}
+
+template <typename T>
+void SetBit(T& value, size_t bit_number, bool bit_value)
+{
+  static_assert(std::is_unsigned<T>(), "SetBit is only sane on unsigned types.");
+
+  if (bit_value)
+    value |= (T{1} << bit_number);
+  else
+    value &= ~(T{1} << bit_number);
+}
+
+}  // namespace Common
diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h
index f2f52a5..787d505 100644
--- a/src/dolphin/Compat.h
+++ b/src/dolphin/Compat.h
@@ -61,3 +61,15 @@
     {                                       \
         printf(fmt "\n", ## __VA_ARGS__);   \
     } while (false)
+
+#if __cplusplus < 201703L
+// cheat
+namespace std
+{
+template <typename T>
+T clamp(const T& v, const T& lo, const T& hi)
+{
+    return v < lo ? lo : (v > hi ? hi : v);
+}
+}
+#endif
\ No newline at end of file
diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp
new file mode 100644
index 0000000..70f2ede
--- /dev/null
+++ b/src/dolphin/MathUtil.cpp
@@ -0,0 +1,13 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "MathUtil.h"
+
+#include <numeric>
+
+// Calculate sum of a float list
+float MathFloatVectorSum(const std::vector<float>& Vec)
+{
+  return std::accumulate(Vec.begin(), Vec.end(), 0.0f);
+}
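The BitCast helper above is the pattern melonDS inherits from Dolphin for reinterpreting object representations without aliasing UB; a minimal sketch of the call style (illustrative, with a hypothetical FloatBits helper):

    #include <cstdint>
    #include "BitUtils.h"

    // Read the IEEE-754 bit pattern of a float without a reinterpret_cast.
    uint32_t FloatBits(float f)
    {
        return Common::BitCast<uint32_t>(f); // FloatBits(1.0f) == 0x3F800000
    }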
diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h
new file mode 100644
index 0000000..b1dbbae
--- /dev/null
+++ b/src/dolphin/MathUtil.h
@@ -0,0 +1,121 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+#include "Compat.h"
+
+#include "../types.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace MathUtil
+{
+constexpr double TAU = 6.2831853071795865;
+constexpr double PI = TAU / 2;
+
+template <typename T>
+constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{}))
+{
+  return (T{} < val) - (val < T{});
+}
+
+template <typename T, typename F>
+constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a)
+{
+  return x + (y - x) * a;
+}
+
+template <typename T>
+constexpr bool IsPow2(T imm)
+{
+  return imm > 0 && (imm & (imm - 1)) == 0;
+}
+
+constexpr u32 NextPowerOf2(u32 value)
+{
+  --value;
+  value |= value >> 1;
+  value |= value >> 2;
+  value |= value >> 4;
+  value |= value >> 8;
+  value |= value >> 16;
+  ++value;
+
+  return value;
+}
+
+template <typename T>
+struct Rectangle
+{
+  T left{};
+  T top{};
+  T right{};
+  T bottom{};
+
+  constexpr Rectangle() = default;
+
+  constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom)
+      : left(theLeft), top(theTop), right(theRight), bottom(theBottom)
+  {
+  }
+
+  constexpr bool operator==(const Rectangle& r) const
+  {
+    return left == r.left && top == r.top && right == r.right && bottom == r.bottom;
+  }
+
+  T GetWidth() const { return abs(right - left); }
+  T GetHeight() const { return abs(bottom - top); }
+  // If the rectangle is in a coordinate system with a lower-left origin, use
+  // this Clamp.
+  void ClampLL(T x1, T y1, T x2, T y2)
+  {
+    left = std::clamp(left, x1, x2);
+    right = std::clamp(right, x1, x2);
+    top = std::clamp(top, y2, y1);
+    bottom = std::clamp(bottom, y2, y1);
+  }
+
+  // If the rectangle is in a coordinate system with an upper-left origin,
+  // use this Clamp.
+  void ClampUL(T x1, T y1, T x2, T y2)
+  {
+    left = std::clamp(left, x1, x2);
+    right = std::clamp(right, x1, x2);
+    top = std::clamp(top, y1, y2);
+    bottom = std::clamp(bottom, y1, y2);
+  }
+};
+
+}  // namespace MathUtil
+
+float MathFloatVectorSum(const std::vector<float>&);
+
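The power-of-two helpers above behave just as the names suggest; a few illustrative values (a hypothetical usage sketch, assuming the header is included as "MathUtil.h"):

    #include "MathUtil.h"

    static_assert(MathUtil::IsPow2(0x4000), "16 KB is a power of two");
    static_assert(!MathUtil::IsPow2(0x6000), "0x6000 is not");
    static_assert(MathUtil::NextPowerOf2(0x6000) == 0x8000, "rounds up");
    static_assert(MathUtil::NextPowerOf2(0x8000) == 0x8000, "powers of two map to themselves");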
+// Rounds down. 0 -> undefined
+inline int IntLog2(u64 val)
+{
+#if defined(__GNUC__)
+  return 63 - __builtin_clzll(val);
+
+#elif defined(_MSC_VER)
+  unsigned long result = ULONG_MAX;
+  _BitScanReverse64(&result, val);
+  return result;
+
+#else
+  int result = -1;
+  while (val != 0)
+  {
+    val >>= 1;
+    ++result;
+  }
+  return result;
+#endif
+}
-- cgit v1.2.3

From 899cf97c51578e5c9ea83d3e81b3c7a54595666a Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 18:33:05 +0100
Subject: apply fixes for aarch64 linux by @nadiaholmquist

---
 src/ARMJIT_A64/ARMJIT_Compiler.cpp | 16 ++++++++++++++++
 src/dolphin/Arm64Emitter.cpp       |  2 +-
 src/dolphin/Arm64Emitter.h         |  1 -
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
index 89d0029..b598ac8 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
@@ -8,6 +8,9 @@
 #include "../switch/compat_switch.h"

 extern char __start__;
+#else
+#include <sys/mman.h>
+#include <unistd.h>
 #endif

 #include <malloc.h>
@@ -34,6 +37,9 @@ template <>
 const int RegisterCache<Compiler, ARM64Reg>::NativeRegsAvailable = 8;

 const int JitMemSize = 16 * 1024 * 1024;
+#ifndef __SWITCH__
+u8 JitMem[JitMemSize];
+#endif

 void Compiler::MovePC()
 {
@@ -76,6 +82,16 @@ Compiler::Compiler()
     SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart);
     JitMemUseableSize = JitMemSize;
     Reset();
+#else
+    #else
+    u64 pageSize = sysconf(_SC_PAGE_SIZE);
+    u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize);
+    u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned;
+    mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
+
+    SetCodeBase(pageAligned, pageAligned);
+    JitMemUseableSize = alignedSize;
+    Reset();
 #endif

 for (int i = 0; i < 3; i++)
diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp
index dbcf425..dd2416b 100644
--- a/src/dolphin/Arm64Emitter.cpp
+++ b/src/dolphin/Arm64Emitter.cpp
@@ -8,9 +8,9 @@
 #include
 #include

+#include "Compat.h"
 #include "Align.h"
 #include "Arm64Emitter.h"
-#include "Assert.h"
 #include "BitUtils.h"
 #include "../types.h"
 #include "MathUtil.h"
diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h
index 4cb9ff7..3d9d4ba 100644
--- a/src/dolphin/Arm64Emitter.h
+++ b/src/dolphin/Arm64Emitter.h
@@ -8,7 +8,6 @@
 #include

 #include "ArmCommon.h"
-#include "Assert.h"
 #include "BitSet.h"
 #include "Compat.h"
-- cgit v1.2.3

From 2dbb9840fb4bba05d14d4df526c6a8ff2051d85c Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 18:50:16 +0100
Subject: re add error for unsupported JIT platforms

---
 src/ARMJIT.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 561fabb..208801e 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -8,8 +8,10 @@
 #include "ARMJIT_Internal.h"
 #if defined(__x86_64__)
 #include "ARMJIT_x64/ARMJIT_Compiler.h"
-#else
+#elif defined(__aarch64__)
 #include "ARMJIT_A64/ARMJIT_Compiler.h"
+#else
+#error "The current target platform doesn't have a JIT backend"
 #endif

 #include "ARMInterpreter_ALU.h"
-- cgit v1.2.3

From 42d67c8145fdedc62acc7daa3e756b771b81e7b6 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 19:07:30 +0100
Subject: fix LDM usermode for aarch64 as well

---
 src/ARMJIT_A64/ARMJIT_Compiler.cpp  | 3 ++-
 src/ARMJIT_A64/ARMJIT_Compiler.h    | 2 ++
 src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp
b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index b598ac8..d61cc9c 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -357,7 +357,8 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + F(Nop) }; #undef F #define F(x) &Compiler::T_Comp_##x diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 7e13507..5c9ef41 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -103,6 +103,8 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void Nop() {} + void A_Comp_ALUTriOp(); void A_Comp_ALUMovOp(); void A_Comp_ALUCmpOp(); diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index a5d0e3f..4fd8559 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -639,7 +639,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int reg = *it; - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (RegCache.Mapping[reg] != INVALID_REG) MOV(W3, MapReg(reg)); -- cgit v1.2.3 From 266fd20ea536e1c2cd98fce49ef23dbd01f3a8cd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:08:29 +0100 Subject: fixup for aarch64 JIT --- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 1 - src/ARMJIT_RegisterCache.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index d61cc9c..2033307 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -83,7 +83,6 @@ Compiler::Compiler() JitMemUseableSize = JitMemSize; Reset(); #else - #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index b894657..8460825 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -18,11 +18,15 @@ public: RegisterCache() {} - RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount, bool pcAllocatableAsSrc = false) : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) { for (int i = 0; i < 16; i++) Mapping[i] = (Reg)-1; + + PCAllocatableAsSrc = ~(pcAllocatableAsSrc + ? 
0 + : (1 << 15)); } void UnloadRegister(int reg) @@ -120,7 +124,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -184,6 +188,8 @@ public: u16 LoadedRegs = 0; u16 DirtyRegs = 0; + u16 PCAllocatableAsSrc = 0; + T* Compiler; FetchedInstr* Instrs; -- cgit v1.2.3 From 2e6e6aa75094bfb3efbae805006249c26c0c4726 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:12:09 +0100 Subject: this it should work --- src/ARMJIT_RegisterCache.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 8460825..d4e5539 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,6 +95,20 @@ public: LiteralsLoaded = 0; } + BitSet32 GetPushRegs() + { + BitSet16 used; + for (int i = 0; i < InstrsCount; i++) + used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); + + BitSet32 res; + u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); + for (int i = 0; i < registersMax; i++) + res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); + + return res; + } + void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -111,7 +125,7 @@ public: for (int j = 0; j < 16; j++) ranking[j] = 0; for (int j = i; j < InstrsCount; j++) - { + {s BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); -- cgit v1.2.3 From e9760c941b1e08d4908bf8697e1fa427f6ed8b85 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:21:08 +0100 Subject: git played a prank on me haha very funny --- src/ARMJIT_RegisterCache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index d4e5539..5e18e84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -125,7 +125,7 @@ public: for (int j = 0; j < 16; j++) ranking[j] = 0; for (int j = i; j < InstrsCount; j++) - {s + { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); @@ -212,4 +212,4 @@ public: } -#endif \ No newline at end of file +#endif -- cgit v1.2.3 From 05962d9798a065d0a619f31a4013910b9635b5bc Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:34:26 +0100 Subject: the time of good commit names is long gone --- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 2033307..513c117 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -357,7 +357,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special NULL, NULL, NULL, NULL, NULL, NULL, NULL, - F(Nop) + &Compiler::Nop }; #undef F #define F(x) &Compiler::T_Comp_##x -- cgit v1.2.3 From 0280fbe194042e39f050f56faf4796e46d5ebe2d 
Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Fri, 7 Feb 2020 00:43:05 +0100
Subject: this mistake was phenomally stupid

---
 src/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'src')

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8b81ce3..912299d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -35,10 +35,6 @@ add_library(core STATIC
 if (ENABLE_JIT)
     target_sources(core PRIVATE
         ARMJIT.cpp
-        ARMJIT_x64/ARMJIT_Compiler.cpp
-        ARMJIT_x64/ARMJIT_ALU.cpp
-        ARMJIT_x64/ARMJIT_LoadStore.cpp
-        ARMJIT_x64/ARMJIT_Branch.cpp
         dolphin/CommonFuncs.cpp
     )
-- cgit v1.2.3

From 96b8ac1af2f2ac08df25625532e9179f0e75c54c Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Mon, 6 Apr 2020 12:25:35 +0200
Subject: preparations for block linking

---
 src/ARMJIT_Internal.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h
index fb05f75..b968dcb 100644
--- a/src/ARMJIT_Internal.h
+++ b/src/ARMJIT_Internal.h
@@ -86,6 +86,14 @@ struct __attribute__((packed)) TinyVector
         Capacity = capacity;
     }

+    void SetLength(u16 length)
+    {
+        if (Capacity < length)
+            MakeCapacity(length);
+
+        Length = length;
+    }
+
     void Clear()
     {
         Length = 0;
@@ -147,12 +155,7 @@ public:
     {
         NumInstrs = numInstrs;
         NumAddresses = numAddresses;
-        Data = new u32[numInstrs + numAddresses];
-    }
-
-    ~JitBlock()
-    {
-        delete[] Data;
+        Data.SetLength(numInstrs + numAddresses);
     }

     u32 StartAddr;
@@ -160,13 +163,14 @@ public:

     u32 NumInstrs;
     u32 NumAddresses;
+    u32 NumLinks;

     JitBlockEntry EntryPoint;

     u32* Instrs()
-    { return Data; }
+    { return &Data[0]; }
     u32* AddressRanges()
-    { return Data + NumInstrs; }
+    { return &Data[NumInstrs]; }

 private:
     /*
+        NumInstrs..<(NumLinks + NumInstrs) - pseudo physical addresses where the block is located
        (atleast one, the pseudo physical address of the block)
     */
-    u32* Data;
+    TinyVector<u32> Data;
 };

 // size should be 16 bytes because I'm to lazy to use mul and whatnot
-- cgit v1.2.3

From 59f710158f31b232a7d83cda6cc19b724269a422 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Mon, 6 Apr 2020 12:31:20 +0200
Subject: arm64 fix itcm invalidation and ldm^/stm^

---
 src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 50 ++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
index 4fd8559..6cf710b 100644
--- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
@@ -109,12 +109,12 @@ void* Compiler::Gen_MemoryRoutine9(int size, bool store)
     ANDI2R(W3, W0, 0x7FFF & addressMask);
     if (store)
     {
-        LSR(W0, W3, 8);
-        ADDI2R(W0, W0, ExeMemRegionOffsets[exeMem_ITCM], W4);
+        ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4);
+        LSR(W5, W0, 9);
         MOVP2R(X4, CodeRanges);
-        ADD(X4, X4, X0, ArithOption(X0, ST_LSL, 4));
+        ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4));
         static_assert(sizeof(AddressRange) == 16);
-        LDR(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length));
+        LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length));
         FixupBranch null = CBZ(W4);
         ABI_PushRegisters({1, 3, 30});
         QuickCallFunction(X4, InvalidateByAddr);
@@ -211,34 +211,34 @@ void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc)

     ANDI2R(W4, W0, ~3 & 0x7FFF);
+    ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5);
     if (store)
     {
-        LSR(W6, W4, 8);
-        ADDI2R(W6, W6, ExeMemRegionOffsets[exeMem_ITCM], W5);
+ LDR(W5, RCPU, X6); + STR(X5, X1, ArithOption(X2, true)); + } + + if (store) + { + ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); + LSR(W6, W4, 9); MOVP2R(X5, CodeRanges); ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); static_assert(sizeof(AddressRange) == 16); - LDR(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); + LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); FixupBranch null = CBZ(W5); ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W6); + MOV(W0, W4); QuickCallFunction(X5, InvalidateByAddr); ABI_PopRegisters({0, 1, 2, 4, 30}); SetJumpTarget(null); } - ADDI2R(W4, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } - if (!preinc) ADD(W0, W0, 4); CBNZ(W2, loopStart); @@ -639,7 +639,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int reg = *it; - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { if (RegCache.Mapping[reg] != INVALID_REG) MOV(W3, MapReg(reg)); @@ -663,7 +663,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - + STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); i--; @@ -696,7 +696,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) + if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) UBFX(W0, RCPSR, 0, 5); int i = regsCount - 1; @@ -708,7 +708,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int reg = *it; - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { LDR(INDEX_UNSIGNED, W3, SP, i * 8); MOVI2R(W1, reg - 8); @@ -739,7 +739,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (*nextReg != 15) RegCache.DirtyRegs |= 1 << *nextReg; } - + LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); if (first == W3) -- cgit v1.2.3 From 5d0f244f3c86c2b1c65566bffa3972ae1dbac27b Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 16 Apr 2020 16:40:29 +0200 Subject: include more information in DataRegion --- src/ARM.h | 16 ++++++++-------- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_Internal.h | 7 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/CP15.cpp | 12 ++++++++++++ 5 files changed, 29 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/ARM.h b/src/ARM.h index 8282c01..7767095 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -308,7 +308,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = NDS::ARM7Read8(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -317,7 +317,7 @@ public: addr &= ~1; *val = NDS::ARM7Read16(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -326,7 +326,7 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -341,7 +341,7 @@ public: void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -350,7 +350,7 @@ public: addr &= ~1; NDS::ARM7Write16(addr, val); - DataRegion = addr >> 
24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -359,7 +359,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -390,7 +390,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) // mainRAM + if ((DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles += numC + numD; @@ -417,7 +417,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) + if ((DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) Cycles += numC + numD; diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 513c117..00fa436 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) + if ((CurInstr.DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index b968dcb..0d6add9 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -40,9 +40,9 @@ struct FetchedInstr u32 Instr; u32 Addr; - u8 CodeCycles; u8 DataCycles; - u8 DataRegion; + u16 CodeCycles; + u32 DataRegion; ARMInstrInfo::Info Info; }; @@ -195,6 +195,9 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; +extern u8 MemRegion9[0x80000]; +extern u8 MemRegion7[0x80000]; + void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5afe842..d69bdff 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -578,7 +578,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -623,7 +623,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) + if ((CurInstr.DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/CP15.cpp b/src/CP15.cpp index 10c3b1b..8bb4f6b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -728,6 +728,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { + DataRegion = addr >> 12; + if (addr < ITCMSize) { DataCycles = 1; @@ -747,6 +749,8 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { + DataRegion = addr >> 12; + addr &= ~1; if (addr < ITCMSize) @@ -768,6 +772,8 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { + DataRegion = addr >> 12; + addr &= ~3; if (addr < ITCMSize) @@ -810,6 +816,8 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { + DataRegion = addr >> 12; + if (addr < ITCMSize) { DataCycles = 1; @@ -832,6 +840,8 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { + DataRegion = addr >> 12; + addr &= ~1; if (addr < ITCMSize) @@ -856,6 +866,8 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { + DataRegion = addr >> 12; + addr &= ~3; if (addr < ITCMSize) -- cgit v1.2.3 From 3787bab1f69ae22d3e8106d70598ce923e5efe70 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 7 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4871 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..32cb91c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -206,15 +206,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -227,9 +227,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -272,7 +272,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] 
= CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -285,7 +285,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -544,7 +544,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -584,14 +584,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -685,7 +687,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -725,14 +727,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp += Cycles; - Cycles = 0; + NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index 7767095..4877956 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -185,14 +185,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -203,9 +203,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -215,9 +215,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -375,13 +375,13 @@ public: void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. 
-        Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
+        Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
     }
 
     void AddCycles_CDI()
@@ -393,21 +393,21 @@ public:
         if ((DataRegion >> 4) == 0x02) // mainRAM
         {
             if (CodeRegion == 0x02)
-                Cycles += numC + numD;
+                Cycles -= numC + numD;
             else
             {
                 numC++;
-                Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+                Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
             }
         }
         else if (CodeRegion == 0x02)
         {
             numD++;
-            Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else
         {
-            Cycles += numC + numD + 1;
+            Cycles -= numC + numD + 1;
         }
     }
 
@@ -420,17 +420,17 @@ public:
         if ((DataRegion >> 4) == 0x02)
         {
             if (CodeRegion == 0x02)
-                Cycles += numC + numD;
+                Cycles -= numC + numD;
             else
-                Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+                Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else if (CodeRegion == 0x02)
         {
-            Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else
         {
-            Cycles += numC + numD;
+            Cycles -= numC + numD;
         }
     }
 };
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 208801e..cc8d4ce 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -2,6 +2,10 @@
 #include
 #include
+#include <unordered_map>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
 
 #include "Config.h"
 
@@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = {
 u32 AddrTranslate9[0x2000];
 u32 AddrTranslate7[0x4000];
 
-JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 AddressRange CodeRanges[ExeMemSpaceSize / 512];
 
-TinyVector<JitBlock*> JitBlocks;
-JitBlock* RestoreCandidates[0x1000] = {NULL};
+std::unordered_map<u32, JitBlock*> JitBlocks;
 
-u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
+template <typename K, typename V, int Size, V InvalidValue>
+struct UnreliableHashTable
 {
-    return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
-}
+    struct Bucket
+    {
+        K KeyA, KeyB;
+        V ValA, ValB;
+    };
+
+    Bucket Table[Size];
+
+    void Reset()
+    {
+        for (int i = 0; i < Size; i++)
+        {
+            Table[i].ValA = Table[i].ValB = InvalidValue;
+        }
+    }
+
+    UnreliableHashTable()
+    {
+        Reset();
+    }
+
+    V Insert(K key, V value)
+    {
+        u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1);
+        Bucket* bucket = &Table[slot];
+
+        if (bucket->ValA == value || bucket->ValB == value)
+        {
+            return InvalidValue;
+        }
+        else if (bucket->ValA == InvalidValue)
+        {
+            bucket->KeyA = key;
+            bucket->ValA = value;
+        }
+        else if (bucket->ValB == InvalidValue)
+        {
+            bucket->KeyB = key;
+            bucket->ValB = value;
+        }
+        else
+        {
+            V prevVal = bucket->ValB;
+            bucket->KeyB = bucket->KeyA;
+            bucket->ValB = bucket->ValA;
+            bucket->KeyA = key;
+            bucket->ValA = value;
+            return prevVal;
+        }
+
+        return InvalidValue;
+    }
+
+    void Remove(K key)
+    {
+        u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1);
+        Bucket* bucket = &Table[slot];
+
+        if (bucket->KeyA == key && bucket->ValA != InvalidValue)
+        {
+            bucket->ValA = InvalidValue;
+            if (bucket->ValB != InvalidValue)
+            {
+                bucket->KeyA = bucket->KeyB;
+                bucket->ValA = bucket->ValB;
+                bucket->ValB = InvalidValue;
+            }
+        }
+        if (bucket->KeyB == key && bucket->ValB != InvalidValue)
+            bucket->ValB = InvalidValue;
+    }
+
+    V LookUp(K addr)
+    {
+        u32 slot = XXH3_64bits(&addr, 4) & (Size - 1);
+        Bucket* bucket = &Table[slot];
+
+        if (bucket->ValA != InvalidValue && bucket->KeyA == addr)
+            return bucket->ValA;
+        if (bucket->ValB != InvalidValue && bucket->KeyB == addr)
+            return bucket->ValB;
+
+        return InvalidValue;
+    }
+};
+
+UnreliableHashTable RestoreCandidates;
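// (Illustrative aside, not part of the patch: UnreliableHashTable is a lossy,
// fixed-size cache with two entries per XXH3-hashed bucket. Insert() never
// allocates; a full bucket demotes its A entry to the B slot and hands the
// evicted B value back to the caller, and LookUp() may miss even for a key
// that was inserted earlier -- hence "unreliable". The template arguments of
// the two globals here were lost in extraction; from the surrounding usage
// they must be keyed by u32, holding JitBlock* with NULL as InvalidValue for
// RestoreCandidates, and u32 code offsets with UINT32_MAX as InvalidValue for
// FastBlockLookUp; the table sizes are not recoverable from this patch.
// A minimal usage sketch:
//
//     JitBlock* evicted = RestoreCandidates.Insert(addr, block);
//     if (evicted)
//         delete evicted;         // whatever fell out is the caller's to free
//     JitBlock* hit = RestoreCandidates.LookUp(addr); // may be NULL regardless
// )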
+UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; 
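            // every 512-byte code range this block spans drops its block list;
            // TimesInvalidated deliberately saturates instead of wrapping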
            AddressRange* range = &CodeRanges[addr / 512];
            range->Blocks.Clear();
            if (range->TimesInvalidated + 1 > range->TimesInvalidated)
                range->TimesInvalidated++;
        }
+        for (int i = 0; i < block->NumLinks(); i++)
+            compiler->UnlinkBlock(block->Links()[i]);
+        block->ResetLinks();
 
-        u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr);
-        if (RestoreCandidates[slot] && RestoreCandidates[slot] != block)
-            delete RestoreCandidates[slot];
-
-        RestoreCandidates[slot] = block;
+        JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block);
+        if (prevBlock)
+            delete prevBlock;
     }
 
-    JitBlocks.Clear();
+    JitBlocks.clear();
 }
 
 void ResetBlockCache()
 {
     printf("Resetting JIT block cache...\n");
-
-    memset(FastBlockAccess, 0, sizeof(FastBlockAccess));
-    for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++)
+
+    FastBlockLookUp.Reset();
+    for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++)
     {
-        if (RestoreCandidates[i])
+        if (RestoreCandidates.Table[i].ValA)
         {
-            delete RestoreCandidates[i];
-            RestoreCandidates[i] = NULL;
+            delete RestoreCandidates.Table[i].ValA;
+            RestoreCandidates.Table[i].ValA = NULL;
+        }
+        if (RestoreCandidates.Table[i].ValB)
+        {
+            delete RestoreCandidates.Table[i].ValB;
+            RestoreCandidates.Table[i].ValB = NULL;
         }
     }
-    for (int i = 0; i < JitBlocks.Length; i++)
+    for (auto it : JitBlocks)
     {
-        JitBlock* block = JitBlocks[i];
+        JitBlock* block = it.second;
         for (int j = 0; j < block->NumAddresses; j++)
         {
             u32 addr = block->AddressRanges()[j];
@@ -788,11 +882,43 @@ void ResetBlockCache()
         }
         delete block;
     }
-    JitBlocks.Clear();
+    JitBlocks.clear();
 
     compiler->Reset();
 }
 
+JitBlockEntry LookUpBlockEntry(u32 addr)
+{
+    u32 entryOffset = FastBlockLookUp.LookUp(addr);
+    if (entryOffset != UINT32_MAX)
+        return compiler->AddEntryOffset(entryOffset);
+
+    auto block = JitBlocks.find(addr);
+    if (block != JitBlocks.end())
+    {
+        FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint));
+        return block->second->EntryPoint;
+    }
+    return NULL;
+}
+
+template <u32 Num>
+void LinkBlock(ARM* cpu, u32 codeOffset)
+{
+    u32 targetPseudoPhys = TranslateAddr<Num>(cpu->R[15] - ((cpu->CPSR&0x20)?2:4));
+    auto block = JitBlocks.find(targetPseudoPhys);
+    if (block == JitBlocks.end())
+    {
+        CompileBlock(cpu);
+        block = JitBlocks.find(targetPseudoPhys);
+    }
+
+    JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys);
+
+    block->second->AddLink(codeOffset);
+    compiler->LinkBlock(codeOffset, block->second->EntryPoint);
+}
+
 void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 {
     if (cpu->Num == 0)
@@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 
     return NULL;
 }
-}
\ No newline at end of file
+}
+
+template void ARMJIT::LinkBlock<0>(ARM*, u32);
+template void ARMJIT::LinkBlock<1>(ARM*, u32);
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 09cc463..cab385f 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -32,7 +32,6 @@ extern u32 AddrTranslate9[0x2000];
 extern u32 AddrTranslate7[0x4000];
 
 const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
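// (Illustrative aside, not part of the patch: how the linking machinery above
// fits together. CompileBlock() ends a block with a patchable 5-byte NOP slot
// followed by a jump to a per-CPU branch stub; the first time execution falls
// through, the stub calls LinkBlock(), which compiles the branch target if
// necessary, records the slot's offset in the target's JitBlock, and rewrites
// the NOPs into a direct `jmp targetEntry`. Invalidation undoes this for every
// recorded offset, so a stale direct jump can never outlive the block it
// points into. Condensed to pseudo-C++:
//
//     // emitted at the end of a block:
//     u32 off = rewritePart - ResetStart;   // where the NOP(5) slot sits
//     // BranchStub[Num] tail-calls LinkBlock<Num>(cpu, off), which does:
//     block->AddLink(off);                          // remembered for unlinking
//     compiler->LinkBlock(off, block->EntryPoint);  // NOPs become `jmp entry`
//     // InvalidateByAddr()/InvalidateAll() later call:
//     compiler->UnlinkBlock(off);                   // `jmp entry` back to NOPs
// )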
-extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) @@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr) return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } -template -inline JitBlockEntry LookUpBlock(u32 addr) -{ - return FastBlockAccess[TranslateAddr(addr) / 2]; -} +JitBlockEntry LookUpBlockEntry(u32 addr); + void Init(); void DeInit(); @@ -73,4 +69,6 @@ void ResetBlockCache(); } +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + #endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 0d6add9..66d1808 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -15,7 +15,8 @@ enum { branch_IdleBranch = 1 << 0, branch_FollowCondTaken = 1 << 1, - branch_FollowCondNotTaken = 1 << 2 + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, }; struct FetchedInstr @@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector assert(capacity > Capacity); T* newMem = new T[capacity]; if (Data != NULL) - memcpy(newMem, Data, sizeof(Data) * Length); + memcpy(newMem, Data, sizeof(T) * Length); T* oldData = Data; Data = newMem; @@ -163,7 +164,6 @@ public: u32 NumInstrs; u32 NumAddresses; - u32 NumLinks; JitBlockEntry EntryPoint; @@ -171,6 +171,21 @@ public: { return &Data[0]; } u32* AddressRanges() { return &Data[NumInstrs]; } + u32* Links() + { return &Data[NumInstrs + NumAddresses]; } + + u32 NumLinks() + { return Data.Length - NumInstrs - NumAddresses; } + + void AddLink(u32 link) + { + Data.Add(link); + } + + void ResetLinks() + { + Data.SetLength(NumInstrs + NumAddresses); + } private: /* @@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000]; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); +template +void LinkBlock(ARM* cpu, u32 codeOffset); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index e02865d..cac590a 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) @@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) IrregularCycles = true; BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - bool previouslyDirty = CPSRDirty; + bool cpsrDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) @@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) LoadReg(reg, RegCache.Mapping[reg]); } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } void Compiler::A_Comp_BranchImm() @@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + 
Comp_SpecialBranchBehaviour(false); Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
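        // (descriptive note: there is no fine-grained eviction of emitted
        // code; once the region is nearly exhausted, the entire block cache
        // is dropped and everything is recompiled on demand)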
ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx 
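/* (descriptive note: the ARG*_REG macros name the host ABI's first four
   integer argument registers -- rcx/rdx/r8/r9 on Win64, rdi/rsi/rdx/rcx on
   System V -- so ARM_Dispatch below can take (ARM* cpu, JitBlockEntry entry)
   on either OS. RCPU/rbp and RCPSR/r15d stay pinned across all JIT-emitted
   code, and the callee-saved registers are pushed once here instead of in
   every compiled block.) */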
+#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 912299d..f650f42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,9 +30,13 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp + + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -49,7 +53,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index be6a833..f3f8c6c 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -35,10 +35,10 @@ int GL_ScaleFactor; int GL_Antialias; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,7 +52,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 723ab13..fff476a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -47,10 +47,10 @@ extern int GL_ScaleFactor; extern int GL_Antialias; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include +# undef inline +# endif +#elif defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. 
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include +# else +# include +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. 
*/ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +/* + * Calculates a 32-bit to 64-bit long multiply. + * + * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) + * { + * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); + * } + */ +#if defined(_MSC_VER) && defined(_M_IX86) +# include +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/* + * Calculates a 64->128-bit long multiply. + * + * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + +#else + /* + * Portable scalar method. 
Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/* + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * We don't need to (or want to) mix as much as XXH64. + * + * Short hashes are more evenly distributed, so it isn't necessary. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. 
+ * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + 
XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + __asm__ ("" : "+r" (seed64)); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
*/
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * PRIME64_1;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+
+        return XXH3_avalanche(acc);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * PRIME64_1;
+        int const nbRounds = (int)len / 16;
+        int i;
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        acc = XXH3_avalanche(acc);
+        XXH_ASSERT(nbRounds >= 8);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but the other cases are usually relatively minor
+         * and/or not worth fixing.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        /* last bytes */
+        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        return XXH3_avalanche(acc);
+    }
+}
+
+
+/* ===    Long Keys    === */
+
+#define STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
+
+typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
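+         * Instead, vld1q_u8 takes a plain byte pointer, has no alignment
+         * requirement, and the reinterpret to uint64x2_t is free, e.g.:
+         *
+         *     uint8x16_t raw  = vld1q_u8(xinput + (i * 16));
+         *     uint64x2_t lane = vreinterpretq_u64_u8(raw);   // no instruction emitted
+         *
+         * as done in the loop below.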
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
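+ *
+ * Prefetching aside, one iteration of the loop below is simply:
+ *
+ *     XXH3_accumulate_512(acc,
+ *                         input  + n*STRIPE_LEN,
+ *                         secret + n*XXH_SECRET_CONSUME_RATE,
+ *                         accWidth);
+ *
+ * i.e. each 64-byte stripe advances the secret window by only 8 bytes.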
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyway, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8);    /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);      /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));     /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers; XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const xxh_u8* secret, size_t secretSize)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->acc[0] = PRIME32_3;
+    statePtr->acc[1] = PRIME64_1;
+    statePtr->acc[2] = PRIME64_2;
+    statePtr->acc[3] = PRIME64_3;
+    statePtr->acc[4] = PRIME64_4;
+    statePtr->acc[5] = PRIME32_2;
+    statePtr->acc[6] = PRIME64_5;
+    statePtr->acc[7] = PRIME32_1;
+    statePtr->seed = seed;
+    XXH_ASSERT(secret != NULL);
+    statePtr->secret = secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
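+         *
+         * A worked case (assuming XXH3_INTERNALBUFFER_SIZE == 256 and
+         * STRIPE_LEN == 64, as in this version): with 96 bytes already
+         * buffered and 300 incoming bytes, 160 bytes top the buffer up, the
+         * full buffer is consumed as XXH3_INTERNALBUFFER_STRIPES
+         * (256/64 == 4) stripes, and the remaining 140 bytes are simply
+         * re-buffered by the final copy below.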
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
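+         *
+         * A worked value: for len == 16, (len - 1) << 54 == 0x03C0000000000000,
+         * which sits near the top of low64; the 128x64 multiply by PRIME64_2
+         * further down then carries it into high64 as well.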
+ */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
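+ *
+ * Concretely: even if one of the XXH3_mix16B calls below returns 0, each
+ * accumulator half is still XORed with the raw words of the other lane, e.g.
+ *
+ *     acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+ *
+ * so a zeroed product can never erase the input's contribution.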
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+
+/*
+ * All the functions are actually the same as for the 64-bit streaming variant.
+ * The only difference is the finalization routine.
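+ *
+ * Typical usage (a sketch; part1/part2 and their sizes are placeholders, and
+ * error checks are elided):
+ *
+ *     XXH3_state_t* const st = XXH3_createState();
+ *     XXH3_128bits_reset(st);
+ *     XXH3_128bits_update(st, part1, size1);
+ *     XXH3_128bits_update(st, part2, size2);
+ *     XXH128_hash_t const h = XXH3_128bits_digest(st);
+ *     XXH3_freeState(st);
+ *
+ * reset() also exists as _withSeed() and _withSecret(), and digest() works on
+ * a copy of the state, so more input can be ingested afterwards.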
+ */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         state->secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         state->secret + state->secretLimit + STRIPE_LEN
+                                                       - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#define XXH_IMPLEMENTATION        /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h
new file mode 100644
index 0000000..67a5887
--- /dev/null
+++ b/src/xxhash/xxhash.h
@@ -0,0 +1,1965 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following disclaimer
+ *   in the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ * INLINE mode
+ ******************************/
+/*!
+ * XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ *
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+ && !defined(XXH_INLINE_ALL_31684351384)
+ /* this section should be traversed only once */
+# define XXH_INLINE_ALL_31684351384
+ /* give access to the advanced API, required to compile implementations */
+# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
+# define XXH_STATIC_LINKING_ONLY
+ /* make all functions private */
+# undef XXH_PUBLIC_API
+# if defined(__GNUC__)
+#  define XXH_PUBLIC_API static __inline __attribute__((unused))
+# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define XXH_PUBLIC_API static inline
+# elif defined(_MSC_VER)
+#  define XXH_PUBLIC_API static __inline
+# else
+  /* note: this version may generate warnings for unused static functions */
+#  define XXH_PUBLIC_API static
+# endif
+
+ /*
+  * This part deals with the special case where a unit wants to inline xxHash,
+  * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
+  * as part of some previously included *.h header file.
+  * Without further action, the new include would just be ignored,
+  * and functions would effectively _not_ be inlined (silent failure).
+  * The following macros solve this situation by prefixing all inlined names,
+  * avoiding naming collision with previous inclusions.
+  */
+# ifdef XXH_NAMESPACE
+#  error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
+  /*
+   * Note: Alternative: #undef all symbols (it's a pretty large list).
+   * Without #error: it compiles, but functions are actually not inlined.
+   */
+# endif
+# define XXH_NAMESPACE XXH_INLINE_
+ /*
+  * Some identifiers (enums, type names) are not symbols, but they must
+  * still be renamed to avoid redeclaration.
+  * Alternative solution: do not redeclare them.
+  * However, this requires some #ifdefs, and is a more dispersed action.
+ * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
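+ *
+ * A minimal sketch of the effect (hypothetical prefix "mylib_"; any
+ * identifier-safe value works):
+ *
+ *     #define XXH_NAMESPACE mylib_
+ *     #include "xxhash.h"
+ *     // calling code still writes XXH32(...), but the exported symbol
+ *     // becomes mylib_XXH32, avoiding collisions with other copies.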
+ */
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 7
+#define XXH_VERSION_RELEASE 4
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+* 32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint32_t XXH32_hash_t;
+#else
+# include <limits.h>
+# if UINT_MAX == 0xFFFFFFFFUL
+ typedef unsigned int XXH32_hash_t;
+# else
+#  if ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
+#  else
+#   error "unsupported platform: need a 32-bit type"
+#  endif
+# endif
+#endif
+
+/*!
+ * XXH32():
+ * Calculate the 32-bit hash of a sequence of "length" bytes stored at memory address "input".
+ * The memory between input & input+length must be valid (allocated and read-accessible).
+ * "seed" can be used to alter the result predictably.
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/******* Streaming *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ */
+
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/******* Canonical representation *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint64_t XXH64_hash_t;
+#else
+ /* the following type must have a width of 64-bit */
+ typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*!
+ * XXH64():
+ * Returns the 64-bit hash of a sequence of length @length stored at memory
+ * address @input.
+ * @seed can be used to alter the result predictably.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
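+ *
+ * A minimal usage sketch (hypothetical buffer names, error handling elided):
+ *
+ *     XXH64_hash_t const oneShot = XXH64(buf, bufSize, 0);
+ *
+ *     XXH64_state_t* const st = XXH64_createState();
+ *     XXH64_reset(st, 0);
+ *     XXH64_update(st, buf, bufSize / 2);
+ *     XXH64_update(st, (const char*)buf + bufSize / 2, bufSize - bufSize / 2);
+ *     XXH64_hash_t const streamed = XXH64_digest(st);   // == oneShot
+ *     XXH64_freeState(st);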
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ * + * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run + * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are + * explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The 128-bit version adds additional strength, but it is slightly slower. + * + * The XXH3 algorithm is still in development. + * The results it produces may still change in future versions. + * + * Results produced by v0.7.x are not comparable with results from v0.7.y. + * However, the API is completely stable, and it can safely be used for + * ephemeral data (local sessions). + * + * Avoid storing values in long-term storage until the algorithm is finalized. + * + * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if + * everything remains fine, its current format will be "frozen" and become the + * final one. + * + * After which, return values of XXH3 and XXH128 will no longer change in + * future versions. + * + * XXH3's return values will be officially finalized upon reaching v0.8.0. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) + +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) + +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +#endif + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional + * collision. + * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid trivial sequences, such as repeating sequences and especially '\0', + * as this can cancel out itself. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXH3_SECRET_SIZE_MIN 136 +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly based on the default + * secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. 
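+ *
+ * A minimal sketch (hypothetical buffer names and seed value):
+ *
+ *     XXH64_hash_t const h = XXH3_64bits_withSeed(buf, bufSize, 0x9E3779B1ULL);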
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
+# include <stdalign.h>
+# define XXH_ALIGN(n) alignas(n)
+#elif defined(__GNUC__)
+# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+# define XXH_ALIGN(n) __declspec(align(n))
+#else
+# define XXH_ALIGN(n) /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ && defined(__GNUC__)
+# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+   /* used to store a custom secret generated from the seed. Makes state larger.
+    * Design might change */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+   XXH32_hash_t bufferedSize;
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t secretLimit;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+   /* note: there is some padding after due to alignment on 64 bytes */
+   const unsigned char* secret;
+};   /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever possible.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with the default parameters.
+ * The result will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * The digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, and must outlive the hash streaming session, so
+ * be careful when using stack arrays.
+ * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
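+ *
+ * A minimal streaming sketch (hypothetical secret filled with random bytes;
+ * it must stay valid until the final digest):
+ *
+ *     static unsigned char secret[XXH3_SECRET_SIZE_MIN];   // >= 136 random bytes
+ *     XXH3_state_t* const st = XXH3_createState();
+ *     XXH3_64bits_reset_withSecret(st, secret, sizeof(secret));
+ *     XXH3_64bits_update(st, buf, bufSize);
+ *     XXH64_hash_t const h = XXH3_64bits_digest(st);
+ *     XXH3_freeState(st);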
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
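+ * For example (sketch): qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);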
+ *
+ * return: >0 if *h128_1 > *h128_2
+ *         <0 if *h128_1 < *h128_2
+ *         =0 if *h128_1 == *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/******* Canonical representation *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+# define XXH_IMPLEMENTATION
+#endif
+
+#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be found in xxhash.c.
+ *
+ * However, code inlining requires the implementation to be visible to the
+ * compiler, usually within the header.
+ *
+ * As a workaround, xxhash.c used to be included within xxhash.h. This caused
+ * some issues with some build systems, especially ones which treat .c files
+ * as source files.
+ *
+ * Therefore, the implementation is now directly integrated within xxhash.h.
+ * Another small advantage is that xxhash.c is no longer needed in /include.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+# define XXH_IMPLEM_13a8737387
+
+/* *************************************
+* Tuning parameters
+***************************************/
+/*!
+ * XXH_FORCE_MEMORY_ACCESS:
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selecting a different access method for improved
+ * performance.
+ * Method 0 (default):
+ *     Use `memcpy()`. Safe and portable.
+ * Method 1:
+ *     `__attribute__((packed))` statement. It depends on compiler extensions
+ *     and is therefore not portable.
+ *     This method is safe if your compiler supports it, and *generally* as
+ *     fast or faster than `memcpy`.
+ * Method 2:
+ *     Direct access via cast. This method doesn't depend on the compiler but
+ *     violates the C standard.
+ *     It can generate buggy code on targets which do not support unaligned
+ *     memory accesses.
+ *     But in some circumstances, it's the only known way to get the most
+ *     performance (i.e. GCC + ARMv6)
+ * Method 3:
+ *     Byteshift. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction.
+ *     See https://stackoverflow.com/a/32095106/646947 for details.
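+ *
+ * The macro can be set externally; for example, a build targeting GCC on
+ * ARMv6 might pass -DXXH_FORCE_MEMORY_ACCESS=2 on the compiler command line
+ * (an illustrative choice; benchmark before adopting it).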
+ * Prefer these methods in priority order (0 > 1 > 2 > 3)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#  define XXH_FORCE_MEMORY_ACCESS 2
+# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+   (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#  define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+/*!
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+ * triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is, the result for null input pointers is the same as a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+# define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!
+ * XXH_FORCE_ALIGN_CHECK:
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means: check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned or when alignment
+ * doesn't matter for performance.
+ *
+ * This option does not affect XXH3.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK   /* can be defined externally */
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_FORCE_ALIGN_CHECK 0
+# else
+#  define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+/*!
+ * XXH_NO_INLINE_HINTS:
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#ifndef XXH_NO_INLINE_HINTS
+# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+   || defined(__NO_INLINE__)    /* -O0, -fno-inline */
+#  define XXH_NO_INLINE_HINTS 1
+# else
+#  define XXH_NO_INLINE_HINTS 0
+# endif
+#endif
+
+/*!
+ * XXH_REROLL:
+ * Whether to reroll XXH32_finalize, and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang.
+ */
+#ifndef XXH_REROLL
+# if defined(__OPTIMIZE_SIZE__)
+#  define XXH_REROLL 1
+# else
+#  define XXH_REROLL 0
+# endif
+#endif
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/*!
+ * Modify the local functions below should you wish to use some other memory
+ * routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free(void* p) { free(p); }
+
+/*! and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS /* disable inlining hints */
+# define XXH_FORCE_INLINE static
+# define XXH_NO_INLINE static
+#elif defined(_MSC_VER) /* Visual Studio */
+# define XXH_FORCE_INLINE static __forceinline
+# define XXH_NO_INLINE static __declspec(noinline)
+#else
+# if defined (__cplusplus) \
+   || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+#  ifdef __GNUC__
+#   define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#   define XXH_NO_INLINE static __attribute__((noinline))
+#  else
+#   define XXH_FORCE_INLINE static inline
+#   define XXH_NO_INLINE static
+#  endif
+# else
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+# endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+* Debug
+***************************************/
+/*
+ * DEBUGLEVEL is expected to be defined externally, typically via the compiler's
+ * command line options. The value must be a number.
+ */
+#ifndef DEBUGLEVEL
+# define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+# include <assert.h>   /* note: can still be disabled with NDEBUG */
+# define XXH_ASSERT(c) assert(c)
+#else
+# define XXH_ASSERT(c) ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t xxh_u8;
+#else
+ typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* *** Memory access *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* *** Endianness *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/*!
+ * XXH_CPU_LITTLE_ENDIAN:
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+* Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+* 32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+     * loop (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp, v   // not required with VEX encoding
+     *      pslld tmp, 13   // tmp <<= 13
+     *      psrld v, 19     // v >>= 19
+     *      por v, tmp      // v |= tmp
+     *   compared to one for scalar:
+     *      roll v, 13      // reliably fast across the board
+     *      shldl v, v, 13  // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     * :                // However, as an Input/Output Operand,
+     * "+r"             // constrain a read/write operand (+) as a general purpose register (r).
+     * (acc)            // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize.
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
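+ *
+ * A minimal serialization sketch (hypothetical FILE* f):
+ *
+ *     XXH32_canonical_t c;
+ *     XXH32_canonicalFromHash(&c, h32);
+ *     fwrite(c.digest, sizeof(c.digest), 1, f);   // stored big-endian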
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+* 64-bit hash functions
+*********************************************************************/
+
+/******* Memory access *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+
+/*!
+ * XXH_REROLL_XXH64:
+ * Whether to reroll the XXH64_finalize() loop.
+ *
+ * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
+ * performance gain on 64-bit hosts, as only one jump is required.
+ *
+ * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit
+ * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial
+ * to unroll. The code becomes ridiculously large (the largest function in the
+ * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is
+ * also slightly faster because it fits into cache better and is more likely
+ * to be inlined by the compiler.
+ *
+ * If XXH_REROLL is defined, this is ignored and the loop is always rerolled.
+ */
+#ifndef XXH_REROLL_XXH64
+# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
+   || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
+     || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
+     || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
+     || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
+   || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
+#  define XXH_REROLL_XXH64 1
+# else
+#  define XXH_REROLL_XXH64 0
+# endif
+#endif /* !defined(XXH_REROLL_XXH64) */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer, but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
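/* low three bits clear means the input is 8-byte aligned */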
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
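/* the canonical form stores the hash big-endian, so serialized hashes
   compare equal across hosts of either endianness */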
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From 68d552074bf2c1989d96a8c28cc3f6fe1e6c8b8e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 14:42:37 +0200 Subject: compile UMULLs and some fixes --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 33 +++++++++++++++++++++++++-------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 3 ++- 4 files changed, 30 insertions(+), 12 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 14c223b..43b94b6 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -301,10 +301,11 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } -void Compiler::A_Comp_SMULL_SMLAL() +void Compiler::A_Comp_Mul_Long() { bool S = CurInstr.Instr & (1 << 20); bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); OpArg rd = MapReg(CurInstr.A_Reg(16)); OpArg rm = MapReg(CurInstr.A_Reg(0)); OpArg rs = MapReg(CurInstr.A_Reg(8)); @@ -318,18 +319,34 @@ void Compiler::A_Comp_SMULL_SMLAL() MOV(32, R(RSCRATCH3), rs); TEST(32, R(RSCRATCH3), R(RSCRATCH3)); FixupBranch zeroBSR = J_CC(CC_Z); - BSR(32, RSCRATCH2, R(RSCRATCH3)); - NOT(32, R(RSCRATCH3)); - BSR(32, RSCRATCH, R(RSCRATCH3)); - CMP(32, R(RSCRATCH2), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + if (sign) + { + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + } + else + { + BSR(32, RSCRATCH, R(RSCRATCH3)); + } + SHR(32, R(RSCRATCH), Imm8(3)); SetJumpTarget(zeroBSR); // fortunately that's even right Comp_AddCycles_CI(RSCRATCH, 2); } - MOVSX(64, 32, RSCRATCH2, rm); - MOVSX(64, 32, RSCRATCH3, rs); + if (sign) + { + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + } + else + { + MOV(32, R(RSCRATCH2), rm); + MOV(32, R(RSCRATCH3), rs); + } if (add) { MOV(32, R(RSCRATCH), rd); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index be3709e..1b2d312 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -300,7 +300,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), // Mul - F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff F(A_Comp_CLZ), NULL, NULL, NULL, NULL, // STR @@ -628,7 +628,7 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, 
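/* i: host register holding a cycle count only known at run time */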
int add) } else { - ConstantCycles += i + cycles; + ConstantCycles += cycles; SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index b428c33..a448b6d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -89,7 +89,7 @@ public: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); - void A_Comp_SMULL_SMLAL(); + void A_Comp_Mul_Long(); void A_Comp_CLZ(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 4cafc1c..7f6fa53 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -423,7 +423,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (flags & memop_SubtractOffset) { - MOV(32, R(finalAddr), rnMapped); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); if (!offset.IsZero()) SUB(32, R(finalAddr), offset); } -- cgit v1.2.3 From a9dd6e30adc590e11e3a076c1245f1b0b48f27f6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 19:35:40 +0200 Subject: implement msr and mrs for the x64 JIT --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 127 ++++++++++++++++++++++++++++++++++++- src/ARMJIT_x64/ARMJIT_Compiler.h | 3 + src/ARM_InstrInfo.cpp | 4 ++ 4 files changed, 134 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index cc8d4ce..46f71f1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -824,7 +824,7 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); for (auto it : JitBlocks) { JitBlock* block = it.second; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 1b2d312..52a16dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -38,6 +38,131 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(ReadBanked); + MOV(32, rd, R(ABI_PARAM3)); + } + else + MOV(32, rd, R(RCPSR)); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + OpArg val = CurInstr.Instr & (1 << 25) + ? 
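/* immediate form: 8-bit value rotated right by twice the 4-bit rotate field */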
Imm32(ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))) + : MapReg(CurInstr.A_Reg(0)); + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(ReadBanked); + + MOV(32, R(RSCRATCH2), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH3), Imm32(0xFFFFFFFF)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_NE); + AND(32, R(RSCRATCH2), Imm32(mask)); + + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(ABI_PARAM3), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(ABI_PARAM3), R(RSCRATCH2)); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + AND(32, R(RCPSR), Imm32(~mask)); + if (val.IsImm()) + { + MOV(32, R(RSCRATCH), val); + AND(32, R(RSCRATCH), Imm32(mask)); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + OR(32, R(RCPSR), Imm32(val.Imm32() & mask)); + } + } + else + { + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + AND(32, R(RSCRATCH3), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_E); + + MOV(32, R(RSCRATCH3), R(RCPSR)); + + // I need you ANDN + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(RCPSR), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RCPSR), R(RSCRATCH2)); + + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + if (Thumb || CurInstr.Cond() >= 0xE) + RegCache.Flush(); + else + { + // the ugly way... 
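+ // (ARM::UpdateMode rebanks R8-R14, so cached copies of the high
+ // registers have to be written back before the call and reloaded after)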
+ // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + + MOV(32, R(ABI_PARAM3), R(RCPSR)); + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((void*)&ARM::UpdateMode); + + if (!Thumb && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + } + } +} + /* We'll repurpose this .bss memory @@ -328,7 +453,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(A_Comp_MSR), F(A_Comp_MSR), F(A_Comp_MRS), NULL, NULL, NULL, F(Nop) }; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a448b6d..2230eb8 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -100,6 +100,9 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b884773..28362d9 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -427,6 +427,10 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = ak_UNK; } } + if (res.Kind == ak_MRS && !(instr & (1 << 22))) + res.ReadFlags |= flag_N | flag_Z | flag_C | flag_V; + if ((res.Kind == ak_MSR_IMM || res.Kind == ak_MSR_REG) && instr & (1 << 19)) + res.WriteFlags |= flag_N | flag_Z | flag_C | flag_V; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); -- cgit v1.2.3 From 59c8d3976562ec3ed057f21116b76a3a532bc4d1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 16:17:16 +0200 Subject: hopefully fix stack handling for linux --- src/ARMJIT_x64/ARMJIT_Linkage.s | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s index dbbb024..0a84df0 100644 --- a/src/ARMJIT_x64/ARMJIT_Linkage.s +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -44,6 +44,8 @@ ARM_Dispatch: #ifdef WIN64 sub rsp, 0x28 +#else + sub rsp, 0x8 #endif mov RCPU, ARG1_REG64 mov RCPSR, [RCPU + ARM_CPSR_offset] @@ -58,6 +60,8 @@ ARM_Ret: #ifdef WIN64 add rsp, 0x28 +#else + add rsp, 0x8 #endif pop rbp -- cgit v1.2.3 From 47b44a6be81c122eac6fba7903d0ad0e6726ffc3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 16:27:26 +0200 Subject: fix build with JIT disabled and set default JIT maxblock size to 32 --- src/ARM.cpp | 2 ++ src/CMakeLists.txt | 2 +- src/CP15.cpp | 4 ++++ src/Config.cpp | 4 ++-- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 32cb91c..95d2b8b 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -127,6 +127,7 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); +#ifdef JIT_ENABLED if (!file->Saving && Config::JIT_Enable) { // hack, the JIT doesn't really pipeline @@ -134,6 +135,7 @@ void ARM::DoSavestate(Savestate* file) // loaded while running the interpreter FillPipeline(); } +#endif file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f650f42..c92a21d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,7 +10,6 @@ add_library(core STATIC ARMInterpreter_ALU.cpp 
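# ARM_InstrInfo.cpp is dropped from the base source list here;
# the ENABLE_JIT block below re-adds it, since only the JIT needs it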
ARMInterpreter_Branch.cpp ARMInterpreter_LoadStore.cpp - ARM_InstrInfo.cpp Config.cpp CP15.cpp CRC32.cpp @@ -39,6 +38,7 @@ if (ENABLE_JIT) target_sources(core PRIVATE ARMJIT.cpp + ARM_InstrInfo.cpp dolphin/CommonFuncs.cpp ) diff --git a/src/CP15.cpp b/src/CP15.cpp index 8bb4f6b..62258e9 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,11 +561,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: +#ifdef JIT_ENABLED ARMJIT::InvalidateAll(); +#endif ICacheInvalidateAll(); return; case 0x751: +#ifdef JIT_ENABLED ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); +#endif ICacheInvalidateByAddr(val); return; case 0x752: diff --git a/src/Config.cpp b/src/Config.cpp index f3f8c6c..7971e5a 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -36,7 +36,7 @@ int GL_Antialias; #ifdef JIT_ENABLED int JIT_Enable = false; -int JIT_MaxBlockSize = 12; +int JIT_MaxBlockSize = 32; int JIT_BrancheOptimisations = 2; int JIT_LiteralOptimisations = true; #endif @@ -51,7 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, - {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif -- cgit v1.2.3 From b0b9ec42e42d491a90352aea040eb6ffb319cdf9 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 20:47:36 +0200 Subject: don't use param registers for ReadBanked/WriteBanked should fix linux build --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 64 ++++++++++++++++++------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 +++++----- 3 files changed, 40 insertions(+), 41 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 52a16dc..8d20425 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -48,10 +48,10 @@ void Compiler::A_Comp_MRS() { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); - XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(ReadBanked); - MOV(32, rd, R(ABI_PARAM3)); + MOV(32, rd, R(RSCRATCH3)); } else MOV(32, rd, R(RCPSR)); @@ -75,28 +75,26 @@ void Compiler::A_Comp_MSR() { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); - XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(ReadBanked); - MOV(32, R(RSCRATCH2), Imm32(0xFFFFFF00)); - MOV(32, R(RSCRATCH3), Imm32(0xFFFFFFFF)); + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + AND(32, R(RSCRATCH4), Imm32(0xFFFFFF00)); MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); CMP(32, R(RSCRATCH), Imm8(0x10)); - CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_NE); - AND(32, R(RSCRATCH2), Imm32(mask)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH4), CC_E); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - NOT(32, R(RSCRATCH)); - AND(32, R(ABI_PARAM3), R(RSCRATCH)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + NOT(32, R(RSCRATCH4)); + AND(32, R(RSCRATCH3), R(RSCRATCH4)); AND(32, R(RSCRATCH2), val); - OR(32, R(ABI_PARAM3), R(RSCRATCH2)); + OR(32, R(RSCRATCH3), R(RSCRATCH2)); - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + MOV(32, 
R(RSCRATCH2), Imm32(15 - 8)); CALL(WriteBanked); } else @@ -219,13 +217,13 @@ Compiler::Compiler() { // RSCRATCH mode - // ABI_PARAM2 reg number - // ABI_PARAM3 value in current mode - // ret - ABI_PARAM3 + // RSCRATCH2 reg number + // RSCRATCH3 value in current mode + // ret - RSCRATCH3 ReadBanked = (void*)GetWritableCodePtr(); CMP(32, R(RSCRATCH), Imm8(0x11)); FixupBranch fiq = J_CC(CC_E); - SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); FixupBranch notEverything = J_CC(CC_L); CMP(32, R(RSCRATCH), Imm8(0x12)); FixupBranch irq = J_CC(CC_E); @@ -239,30 +237,30 @@ Compiler::Compiler() RET(); SetJumpTarget(fiq); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ))); RET(); SetJumpTarget(irq); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ))); RET(); SetJumpTarget(svc); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC))); RET(); SetJumpTarget(abt); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT))); RET(); SetJumpTarget(und); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND))); RET(); } { // RSCRATCH mode - // ABI_PARAM2 reg n - // ABI_PARAM3 value + // RSCRATCH2 reg n + // RSCRATCH3 value // carry flag set if the register isn't banked WriteBanked = (void*)GetWritableCodePtr(); CMP(32, R(RSCRATCH), Imm8(0x11)); FixupBranch fiq = J_CC(CC_E); - SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); FixupBranch notEverything = J_CC(CC_L); CMP(32, R(RSCRATCH), Imm8(0x12)); FixupBranch irq = J_CC(CC_E); @@ -277,23 +275,23 @@ Compiler::Compiler() RET(); SetJumpTarget(fiq); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(irq); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(svc); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(abt); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(und); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)), R(RSCRATCH3)); CLC(); RET(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2230eb8..e0a4978 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -16,6 +16,7 @@ const Gen::X64Reg RCPSR = Gen::R15; const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +const Gen::X64Reg RSCRATCH4 = Gen::R8; struct ComplexOperand { diff --git 
a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 7f6fa53..85a3737 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -540,14 +540,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); - POP(ABI_PARAM3); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + POP(RSCRATCH3); CALL(WriteBanked); FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); else - SaveReg(reg, ABI_PARAM3); + SaveReg(reg, RSCRATCH3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) @@ -600,12 +600,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc firstUserMode = false; } if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, ABI_PARAM3); + LoadReg(reg, RSCRATCH3); else - MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); - MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); CALL(ReadBanked); - PUSH(ABI_PARAM3); + PUSH(RSCRATCH3); } else if (RegCache.Mapping[reg] == INVALID_REG) { -- cgit v1.2.3 From bcc4b5c8dda5ec91127808a525e2b7dbda41a4f3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 23:25:32 +0200 Subject: fix regression from last commit also a small mistake with msr --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8d20425..dd20e3c 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -105,7 +105,7 @@ void Compiler::A_Comp_MSR() if ((mask & 0xFF) == 0) { AND(32, R(RCPSR), Imm32(~mask)); - if (val.IsImm()) + if (!val.IsImm()) { MOV(32, R(RSCRATCH), val); AND(32, R(RSCRATCH), Imm32(mask)); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 85a3737..b595e32 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -502,14 +502,6 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int regsCount = regs.Count(); - if (decrement) - { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes @@ -519,6 +511,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { Comp_AddCycles_CDI(); + if (decrement) + { + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(ABI_PARAM1), MapReg(rn)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
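/* use the short imm8 encoding when the stack adjustment fits in a signed byte */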
Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -618,6 +618,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + if (decrement) + { + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(ABI_PARAM1), MapReg(rn)); + MOV(64, R(ABI_PARAM2), R(RSP)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); -- cgit v1.2.3 From 0f53a34551d60964345debb1766f81ca4686eb17 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 00:45:05 +0200 Subject: rewrite JIT memory emulation --- src/ARM.cpp | 10 +- src/ARM.h | 24 +- src/ARMJIT.cpp | 905 +++++++++++++++++++++++++--------- src/ARMJIT.h | 65 ++- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 +- src/ARMJIT_Internal.h | 68 ++- src/ARMJIT_RegisterCache.h | 18 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 34 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 935 +++++++++++++++++++----------------- src/ARM_InstrInfo.cpp | 16 +- src/CP15.cpp | 44 +- src/NDS.cpp | 105 +++- src/NDS.h | 8 + 14 files changed, 1465 insertions(+), 814 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 95d2b8b..205332d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -579,7 +579,8 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<0>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); + if (!translatedAddr) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); @@ -589,7 +590,7 @@ void ARMv5::ExecuteJIT() // hack so Cycles <= 0 becomes Cycles < 0 Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); if (block) ARM_Dispatch(this, block); else @@ -722,7 +723,8 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<1>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); + if (!translatedAddr) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); @@ -731,7 +733,7 @@ void ARMv4::ExecuteJIT() Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); if (block) ARM_Dispatch(this, block); else diff --git a/src/ARM.h b/src/ARM.h index 4877956..f64b7fe 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -308,7 +308,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = NDS::ARM7Read8(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -317,7 +317,7 @@ public: addr &= ~1; *val = NDS::ARM7Read16(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -326,7 +326,7 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -341,7 +341,7 @@ public: void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -350,7 +350,7 @@ public: addr &= ~1; NDS::ARM7Write16(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; 
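+ // DataRegion now keeps the full address; the cycle counting code
+ // below shifts it right by 24 to recover the region byte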
DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -359,7 +359,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -390,7 +390,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -417,7 +417,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -443,4 +443,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 46f71f1..9602aed 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "GPU.h" #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" @@ -34,9 +35,10 @@ namespace ARMJIT #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) -Compiler* compiler; +Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = { +const u32 ExeMemRegionSizes[] = +{ 0x8000, // Unmapped Region (dummy) 0x8000, // ITCM 4*1024*1024, // Main RAM @@ -48,7 +50,8 @@ const u32 ExeMemRegionSizes[] = { 0x40000 // ARM7 WVRAM }; -const u32 ExeMemRegionOffsets[] = { +const u32 ExeMemRegionOffsets[] = +{ 0, 0x8000, 0x10000, @@ -61,65 +64,391 @@ const u32 ExeMemRegionOffsets[] = { 0x518000, }; -#define DUP2(x) x, x - -const static ExeMemKind JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(exeMem_ITCM), - /* 1X*/ DUP2(exeMem_ITCM), // mirror - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ DUP2(exeMem_SWRAM), - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ exeMem_Unmapped, - exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_ARM9_BIOS) - }, - //arm7 - { - /* 0X*/ DUP2(exeMem_ARM7_BIOS), - /* 1X*/ DUP2(exeMem_Unmapped), - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ exeMem_SWRAM, - exeMem_ARM7_WRAM, - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_Unmapped) - } -}; - -#undef DUP2 - /* translates address to pseudo physical address - more compact, eliminates mirroring, everything comes in a row - we only need one translation table */ -u32 AddrTranslate9[0x2000]; -u32 AddrTranslate7[0x4000]; + +u32 TranslateAddr9(u32 addr) +{ + switch (ClassifyAddress9(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM9: + if (NDS::SWRAM_ARM9) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & 
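/* mask reflects the shared-WRAM bank size currently selected via WRAMCNT */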
NDS::SWRAM_ARM9Mask); + else + return 0; + case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); + case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; + case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); + default: return 0; + } +} + +u32 TranslateAddr7(u32 addr) +{ + switch (ClassifyAddress7(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM7: + if (NDS::SWRAM_ARM7) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); + else + return 0; + case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; + case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); + case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); + default: return 0; + } +} AddressRange CodeRanges[ExeMemSpaceSize / 512]; -std::unordered_map JitBlocks; +TinyVector InvalidLiterals; + +std::unordered_map JitBlocks9; +std::unordered_map JitBlocks7; + +u8 MemoryStatus9[0x800000]; +u8 MemoryStatus7[0x800000]; + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + return memregion_SWRAM9; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7) + return memregion_SWRAM7; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void UpdateMemoryStatus9(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress9(addr); + u32 pseudoPhyisical = TranslateAddr9(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus9[i * 8 + j] = val; + } + } +} + +void UpdateMemoryStatus7(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress7(addr); + u32 pseudoPhyisical = TranslateAddr7(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus7[i * 8 + j] = val; + } + } +} + +void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) +{ + for (u32 i = 1; i < exeMem_Count; i++) + { + if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) + { + for (u32 num = 0; num < 2; num++) + { + u32 physSize = ExeMemRegionSizes[i]; + u32 mapSize = 0; + u32 mapStart = 0; + 
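// map this pseudo-physical region back to every virtual window that
+ // mirrors it on the given CPU; note the unbraced ifs below: in the
+ // "mapStart = 0; mapSize = ...;" cases only the first assignment is
+ // actually conditional (the comma form used elsewhere guards both)
+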
switch (i) + { + case exeMem_ITCM: + if (num == 0) + mapStart = 0; mapSize = NDS::ARM9->ITCMSize; + break; + case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; + case exeMem_SWRAM: + if (num == 0) + { + if (NDS::SWRAM_ARM9) + mapStart = 0x3000000, mapSize = 0x1000000; + else + mapStart = mapSize = 0; + } + else + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3000000, mapSize = 0x800000; + else + mapStart = mapSize = 0; + } + break; + case exeMem_LCDC: + if (num == 0) + mapStart = 0x6800000, mapSize = 0xA4000; + break; + case exeMem_ARM9_BIOS: + if (num == 0) + mapStart = 0xFFFF0000, mapSize = 0x10000; + break; + case exeMem_ARM7_BIOS: + if (num == 1) + mapStart = 0; mapSize = 0x4000; + break; + case exeMem_ARM7_WRAM: + if (num == 1) + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3800000, mapSize = 0x800000; + else + mapStart = 0x3000000, mapSize = 0x1000000; + } + break; + case exeMem_ARM7_WVRAM: + if (num == 1) + mapStart = 0x6000000, mapSize = 0x1000000; + break; + } + + for (u32 j = 0; j < mapSize / physSize; j++) + { + u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); + if (num == 0 + && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + if (invalidate) + { + if (num == 0) + MemoryStatus9[virtAddr / 512] |= 0x80; + else + MemoryStatus7[virtAddr / 512] |= 0x80; + } + else + { + if (num == 0) + MemoryStatus9[virtAddr / 512] &= ~0x80; + else + MemoryStatus7[virtAddr / 512] &= ~0x80; + } + } + + } + return; + } + } + + assert(false); +} + +template +T SlowRead9(ARMv5* cpu, u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same::value) + val = NDS::ARM9Read32(addr); + else if (std::is_same::value) + val = NDS::ARM9Read16(addr); + else + val = NDS::ARM9Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite9(ARMv5* cpu, u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + InvalidateITCMIfNecessary(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same::value) + { + NDS::ARM9Write32(addr, val); + } + else if (std::is_same::value) + { + NDS::ARM9Write16(addr, val); + } + else + { + NDS::ARM9Write8(addr, val); + } +} + +template void SlowWrite9(ARMv5*, u32, u32); +template void SlowWrite9(ARMv5*, u32, u16); +template void SlowWrite9(ARMv5*, u32, u8); + +template u32 SlowRead9(ARMv5*, u32); +template u16 SlowRead9(ARMv5*, u32); +template u8 SlowRead9(ARMv5*, u32); + +template +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same::value) + val = NDS::ARM7Read32(addr); + else if (std::is_same::value) + val = NDS::ARM7Read16(addr); + else + val = NDS::ARM7Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same::value) + NDS::ARM7Write32(addr, val); + else if (std::is_same::value) + NDS::ARM7Write16(addr, val); + else + NDS::ARM7Write8(addr, val); +} + +template +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, 
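/* the ARM9 core is passed through for ITCM/DTCM accesses */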
ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite9(cpu, addr, data[i]); + else + data[i] = SlowRead9(cpu, addr); + addr += !PreInc * 4; + } +} + +template +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite7(addr, data[i]); + else + data[i] = SlowRead7(addr); + addr += !PreInc * 4; + } +} + +template void SlowWrite7(u32, u32); +template void SlowWrite7(u32, u16); +template void SlowWrite7(u32, u8); + +template u32 SlowRead7(u32); +template u16 SlowRead7(u32); +template u8 SlowRead7(u32); + +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); template struct UnreliableHashTable @@ -211,31 +540,25 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp; +UnreliableHashTable FastBlockLookUp9; +UnreliableHashTable FastBlockLookUp7; void Init() { - for (int i = 0; i < 0x2000; i++) - { - ExeMemKind kind = JIT_MEM[0][i >> 8]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); - } - for (int i = 0; i < 0x4000; i++) - { - ExeMemKind kind = JIT_MEM[1][i >> 9]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); - } - - compiler = new Compiler(); + JITCompiler = new Compiler(); } void DeInit() { - delete compiler; + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + UpdateMemoryStatus9(0, 0xFFFFFFFF); + UpdateMemoryStatus7(0, 0xFFFFFFFF); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -256,25 +579,31 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) { - switch (instr.Info.Kind) + if (!thumb) { - case ARMInstrInfo::ak_STR_IMM: - case ARMInstrInfo::ak_STRB_IMM: - addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STRD_IMM: - case ARMInstrInfo::ak_STRH_IMM: - addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} - addr = instr.Addr + 8; + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 
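/* bit 23 is the U bit: add or subtract the offset */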
1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); return true; - default: - JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); - return false; } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; } bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, @@ -453,6 +782,8 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F + +extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -463,31 +794,33 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - if (!(cpu->Num == 0 - ? IsMapped<0>(blockAddr) - : IsMapped<1>(blockAddr))) + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr9(blockAddr) + : TranslateAddr7(blockAddr); + if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) { printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); } - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr<0>(blockAddr) - : TranslateAddr<1>(blockAddr); - FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; - u32 addresseRanges[32] = {}; + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; u32 numAddressRanges = 0; + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, - CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -507,23 +840,29 @@ void CompileBlock(ARM* cpu) nextInstrAddr[1] = r15; JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); - u32 translatedAddr = (cpu->Num == 0 - ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; - if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = cpu->Num == 0 + ? 
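/* translate into pseudo-physical space so mirrored addresses share one block */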
TranslateAddr9(instrs[i].Addr) + : TranslateAddr7(instrs[i].Addr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { bool returning = false; for (int j = 0; j < numAddressRanges; j++) { - if (addresseRanges[j] == translatedAddr) + if (addressRanges[j] == translatedAddrRounded) { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); returning = true; break; } } if (!returning) - addresseRanges[numAddressRanges++] = translatedAddr; + addressRanges[numAddressRanges++] = translatedAddrRounded; } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); if (cpu->Num == 0) { @@ -572,7 +911,8 @@ void CompileBlock(ARM* cpu) u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM - || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else @@ -583,21 +923,26 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; - if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem - && instrs[i].Info.SrcRegs == (1 << 15) - && instrs[i].Info.DstRegs == 0) + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) { - assert (!thumb); - - u32 addr; - if (DecodeLiteral(instrs[i], addr)) - { - JIT_DEBUGPRINT("pc relative write detected\n"); - u32 translatedAddr = cpu->Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - ARMJIT::InvalidateByAddr(translatedAddr, false); - CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); - } + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(literalAddr) + : TranslateAddr7(literalAddr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; } if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 @@ -650,8 +995,8 @@ void CompileBlock(ARM* cpu) else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr<0>(target) - : TranslateAddr<1>(target); + ? 
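/* the branch target is translated the same way as the block start address */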
TranslateAddr9(target) + : TranslateAddr7(target); if (link) { @@ -688,36 +1033,29 @@ void CompileBlock(ARM* cpu) i++; - bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; if (prevBlock) { RestoreCandidates.Remove(pseudoPhysicalAddr); - if (prevBlock->NumInstrs == i) - { - for (int j = 0; j < i; j++) - { - if (prevBlock->Instrs()[j] != instrs[j].Instr) - { - mayRestore = false; - break; - } - } - } - else - mayRestore = false; - if (prevBlock->NumAddresses == numAddressRanges) + mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { for (int j = 0; j < numAddressRanges; j++) { - if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) { mayRestore = false; break; @@ -739,18 +1077,21 @@ void CompileBlock(ARM* cpu) if (prevBlock) delete prevBlock; - block = new JitBlock(i, numAddressRanges); - for (int j = 0; j < i; j++) - block->Instrs()[j] = instrs[j].Instr; + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; for (int j = 0; j < numAddressRanges; j++) - block->AddressRanges()[j] = addresseRanges[j]; + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; - block->StartAddr = blockAddr; block->PseudoPhysicalAddr = pseudoPhysicalAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -760,23 +1101,73 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { - assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; + CodeRanges[addressRanges[j] / 512].Blocks.Add(block); + + UpdateRegionByPseudoPhyiscal(addressRanges[j], true); } - JitBlocks[pseudoPhysicalAddr] = block; - FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); + if (cpu->Num == 0) + { + JitBlocks9[pseudoPhysicalAddr] = block; + FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } + else + { + JitBlocks7[pseudoPhysicalAddr] = block; + FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + 
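+ // from here on the block is reachable both through the per-CPU
+ // map and through the fast look-up table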
} } -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) +void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - int startLength = range->Blocks.Length; - for (int i = 0; i < range->Blocks.Length; i++) + u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) { - assert(range->Blocks.Length == startLength); JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + bool literalInvalidation = false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == pseudoPhysical) + { + if (InvalidLiterals.Find(pseudoPhysical) != -1) + { + InvalidLiterals.Add(pseudoPhysical); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -786,76 +1177,59 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); + + if (otherRange->Blocks.Length == 0) + { + otherRange->Code = 0; + UpdateRegionByPseudoPhyiscal(addr, false); + } } } for (int j = 0; j < block->NumLinks(); j++) - compiler->UnlinkBlock(block->Links()[j]); + JITCompiler->UnlinkBlock(block->Links()[j]); + block->ResetLinks(); - JitBlocks.erase(block->PseudoPhysicalAddr); - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + if (block->Num == 0) + { + JitBlocks9.erase(block->PseudoPhysicalAddr); + FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); + } + else + { + JitBlocks7.erase(block->PseudoPhysicalAddr); + FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); + } - if (mayRestore) + if (!literalInvalidation) { JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); if (prevBlock) delete prevBlock; } + else + { + delete block; + } } - if ((range->TimesInvalidated + 1) > range->TimesInvalidated) - range->TimesInvalidated++; - - range->Blocks.Clear(); -} -void InvalidateByAddr7(u32 addr) -{ - u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) - InvalidateByAddr(pseudoPhysical); + if (range->Blocks.Length == 0) + UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); } -void InvalidateITCM(u32 addr) +void InvalidateRegionIfNecessary(u32 pseudoPhyisical) { - u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) - InvalidateByAddr(pseudoPhysical); -} - -void InvalidateAll() -{ - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); - for (auto it : JitBlocks) - { - JitBlock* block = it.second; - - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); - - for (int i = 0; i < block->NumAddresses; i++) - { - u32 addr = block->AddressRanges()[i]; - AddressRange* range = &CodeRanges[addr / 512]; - range->Blocks.Clear(); - if (range->TimesInvalidated + 1 > range->TimesInvalidated) - range->TimesInvalidated++; - } - for (int i = 0; i < block->NumLinks(); i++) - 
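/* UnlinkBlock severs the direct jumps other blocks patched in towards this one */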
compiler->UnlinkBlock(block->Links()[i]); - block->ResetLinks(); - - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); - if (prevBlock) - delete prevBlock; - } - - JitBlocks.clear(); + if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) + InvalidateByAddr(pseudoPhyisical); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - FastBlockLookUp.Reset(); + InvalidLiterals.Clear(); + FastBlockLookUp9.Reset(); + FastBlockLookUp7.Reset(); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -870,61 +1244,119 @@ void ResetBlockCache() RestoreCandidates.Table[i].ValB = NULL; } } - for (auto it : JitBlocks) + for (auto it : JitBlocks9) { JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].TimesInvalidated = 0; - CodeRanges[addr / 512].InvalidLiterals = 0; + CodeRanges[addr / 512].Code = 0; } delete block; } - JitBlocks.clear(); + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); - compiler->Reset(); + JITCompiler->Reset(); } +template JitBlockEntry LookUpBlockEntry(u32 addr) { - u32 entryOffset = FastBlockLookUp.LookUp(addr); + auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; + u32 entryOffset = fastMap.LookUp(addr); if (entryOffset != UINT32_MAX) - return compiler->AddEntryOffset(entryOffset); + return JITCompiler->AddEntryOffset(entryOffset); - auto block = JitBlocks.find(addr); - if (block != JitBlocks.end()) + auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; + auto block = slowMap.find(addr); + if (block != slowMap.end()) { - FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); return block->second->EntryPoint; } return NULL; } +template JitBlockEntry LookUpBlockEntry<0>(u32); +template JitBlockEntry LookUpBlockEntry<1>(u32); + template void LinkBlock(ARM* cpu, u32 codeOffset) { - u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); - auto block = JitBlocks.find(targetPseudoPhys); - if (block == JitBlocks.end()) + auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; + u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); + u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); + auto block = blockMap.find(targetPseudoPhys); + if (block == blockMap.end()) { CompileBlock(cpu); - block = JitBlocks.find(targetPseudoPhys); + block = blockMap.find(targetPseudoPhys); } JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); block->second->AddLink(codeOffset); - compiler->LinkBlock(codeOffset, block->second->EntryPoint); + JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + +template void LinkBlock<0>(ARM*, u32); +template void LinkBlock<1>(ARM*, u32); + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } } void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) { - if ((addr & 0xFF000000) == 0x04000000) + switch (addr & 0xFF000000) { + case 0x04000000: if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) return (void*)NDSCart::ReadROMData; @@ -949,13 +1381,25 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) switch (size | store) { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; case 16: return (void*)NDS::ARM9IORead16; case 17: return (void*)NDS::ARM9IOWrite16; case 32: return (void*)NDS::ARM9IORead32; case 33: return (void*)NDS::ARM9IOWrite32; } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead; + case 9: return NULL; + case 16: return (void*)VRAMRead; + case 17: return (void*)VRAMWrite; + case 32: return (void*)VRAMRead; + case 33: return (void*)VRAMWrite; + } + break; } } else @@ -987,20 +1431,31 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } break; case 0x04800000: - if (addr < 0x04810000 && size == 16) + if (addr < 0x04810000 && size >= 16) { - if (store) - return (void*)Wifi::Write; - else - return (void*)Wifi::Read; + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } } break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } } } return NULL; } } - -template void ARMJIT::LinkBlock<0>(ARM*, u32); -template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index cab385f..44a6140 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,45 +28,60 @@ extern const u32 
ExeMemRegionSizes[]; typedef u32 (*JitBlockEntry)(); -extern u32 AddrTranslate9[0x2000]; -extern u32 AddrTranslate7[0x4000]; - const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... -template -inline bool IsMapped(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; -} - -template -inline u32 TranslateAddr(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); -} +u32 TranslateAddr9(u32 addr); +u32 TranslateAddr7(u32 addr); +template JitBlockEntry LookUpBlockEntry(u32 addr); - void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); -void InvalidateAll(); +void Reset(); + +void InvalidateByAddr(u32 pseudoPhysical); + +void InvalidateRegionIfNecessary(u32 addr); -void InvalidateITCM(u32 addr); -void InvalidateByAddr7(u32 addr); +inline void InvalidateMainRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); +} +inline void InvalidateITCMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); +} +inline void InvalidateLCDCIfNecessary(u32 addr) +{ + if (addr < 0x68A3FFF) + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); +} +inline void InvalidateSWRAM7IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); +} +inline void InvalidateSWRAM9IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); +} +inline void InvalidateARM7WRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); +} +inline void InvalidateARM7WVRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); +} void CompileBlock(ARM* cpu); void ResetBlockCache(); +void UpdateMemoryStatus9(u32 start, u32 end); +void UpdateMemoryStatus7(u32 start, u32 end); + } extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 00fa436..a67f357 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) + if ((CurInstr.DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 66d1808..4e45760 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -152,30 +152,34 @@ struct __attribute__((packed)) TinyVector class JitBlock { public: - JitBlock(u32 numInstrs, u32 numAddresses) + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) { - NumInstrs = numInstrs; + Num = num; NumAddresses = numAddresses; - Data.SetLength(numInstrs + numAddresses); + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); } - u32 StartAddr; u32 PseudoPhysicalAddr; - - u32 NumInstrs; - u32 NumAddresses; + + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; JitBlockEntry EntryPoint; - u32* Instrs() - { return &Data[0]; } u32* AddressRanges() - { return &Data[NumInstrs]; } + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } u32* Links() - { return &Data[NumInstrs + NumAddresses]; } + { return &Data[NumAddresses * 2 + NumLiterals]; } u32 NumLinks() - { return Data.Length - NumInstrs - NumAddresses; } + { return Data.Length - NumAddresses * 2 - NumLiterals; } void AddLink(u32 link) { @@ -184,7 +188,7 @@ public: void ResetLinks() { - Data.SetLength(NumInstrs + NumAddresses); + Data.SetLength(NumAddresses * 2 + NumLiterals); } private: @@ -200,8 +204,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; - u16 InvalidLiterals; - u16 TimesInvalidated; + u32 Code; }; extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; @@ -210,14 +213,45 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemRegion9[0x80000]; -extern u8 MemRegion7[0x80000]; +extern u8 MemoryStatus9[0x800000]; +extern u8 MemoryStatus7[0x800000]; + +extern TinyVector InvalidLiterals; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); template void LinkBlock(ARM* cpu, u32 codeOffset); +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM9, + memregion_SWRAM7, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +template T SlowRead9(ARMv5* cpu, u32 addr); +template void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); + +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 5e18e84..0547c84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,20 +95,6 @@ public: LiteralsLoaded = 0; } - BitSet32 GetPushRegs() - { - BitSet16 used; - for (int i = 0; i < InstrsCount; i++) - used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); - - BitSet32 res; - u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); - for (int i = 0; i < registersMax; i++) - res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); - - return res; - } - void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -139,7 
+125,6 @@ public: UnloadRegister(reg); u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; - u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -182,13 +167,12 @@ public: if (left-- == 0) break; - writeRegs |= (1 << reg) & instr.Info.DstRegs; LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); } } } - DirtyRegs |= writeRegs & ~(1 << 15); + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index dd20e3c..eee2e0f 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -195,26 +195,6 @@ Compiler::Compiler() Reset(); - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - } - MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; - MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; - MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; - MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; - MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; - MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) - { - MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); - MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); - MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); - } - { // RSCRATCH mode // RSCRATCH2 reg number @@ -317,6 +297,12 @@ Compiler::Compiler() // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; } void Compiler::LoadCPSR() @@ -504,6 +490,9 @@ void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; } void Compiler::Comp_SpecialBranchBehaviour(bool taken) @@ -544,8 +533,16 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); ResetBlockCache(); + } ConstantCycles = 0; Thumb = thumb; @@ -762,12 +759,14 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { + IrregularCycles = true; + s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e0a4978..9df218b 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -140,7 +140,7 @@ public: }; void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - void Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -154,12 +154,6 @@ public: void Comp_SpecialBranchBehaviour(bool taken); - void* Gen_MemoryRoutine9(bool store, int size); - - void* Gen_MemoryRoutineSeq9(bool store, bool preinc); - void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); - - void* Gen_ChangeCPSRRoutine(); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -193,6 +187,26 @@ public: return (u8*)entry - ResetStart; } + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + u8* ResetStart; u32 CodeMemSize; @@ -201,12 +215,6 @@ public: void* BranchStub[2]; - void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2]; - - void* MemoryFuncsSeq9[2][2]; - void* MemoryFuncsSeq7[2][2][2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b595e32..c13b779 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -25,236 +25,17 @@ int squeezePointer(T* ptr) improvement. */ -/* - address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) - store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) -*/ -void* Compiler::Gen_MemoryRoutine9(bool store, int size) +bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM9Write32, true); break; - case 16: JMP((u8*)NDS::ARM9Write16, true); break; - case 8: JMP((u8*)NDS::ARM9Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - // everything's already in the appropriate register - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM9Read16, true); - } - else - JMP((u8*)NDS::ARM9Read8, true); - } - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - if (store) - MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); - else - { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); + u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - SetJumpTarget(insideITCM); - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX - AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); - if (store) - { - MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - - // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! 
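// [editor's sketch] The x64 sequence just below, restated as plain C++ so the
// shift arithmetic is easier to follow: SHR by 9 indexes 512-byte ranges (the
// "/256" in the comment above looks stale), and SHL by 4 scales by
// sizeof(AddressRange) == 16. The helper name is invented for illustration
// and does not exist in the tree; CodeRanges, ExeMemRegionOffsets and
// InvalidateByAddr are the patch's own symbols.
static void CheckSelfModifyingITCM(u32 itcmOffset)
{
    u32 pseudoPhysical = ExeMemRegionOffsets[exeMem_ITCM] + itcmOffset;
    if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0)
        InvalidateByAddr(pseudoPhysical); // a compiled block covers this write
}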
- static_assert(sizeof(AddressRange) == 16); - LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - JMP((u8*)InvalidateByAddr, true); - SetJumpTarget(noCode); - } - else + int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + if (invalidLiteralIdx != -1) { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - RET(); - - static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); - - return res; -} - -#define MEMORY_SEQ_WHILE_COND \ - if (!store) \ - MOV(32, currentElement, R(EAX));\ - if (!preinc) \ - ADD(32, R(ABI_PARAM1), Imm8(4)); \ - \ - SUB(32, R(ABI_PARAM3), Imm8(1)); \ - J_CC(CC_NZ, repeat); - -/* - ABI_PARAM1 address - ABI_PARAM2 address where registers are stored - ABI_PARAM3 how many values to read/write - - Dolphin x64CodeEmitter is my favourite assembler - */ -void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM9Write32); - } - else - CALL((void*)NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideITCM); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - - ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); - CALL((u8*)InvalidateByAddr); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - SetJumpTarget(noCode); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, 
offsetof(ARMv5, ITCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM7Write32); - } - else - CALL((void*)NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -#undef MEMORY_SEQ_WHILE_COND - -void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -276,12 +57,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) RegCache.PutLiteral(rd, val); Comp_AddCycles_CDI(); + + return true; } -/*void fault(u32 a, u32 b, u32 c, u32 d) -{ - printf("actually not static! %x %x %x %x\n", a, b, c, d); -}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,17 +70,12 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, rd, addr); + + if (Comp_MemLoadLiteral(size, rd, addr)) return; - } } { @@ -314,173 +88,334 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz Comp_AddCycles_CDI(); } + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; - - void* memoryFunc = Num == 0 - ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] - : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (!addrIsStatic) { - u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - - /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); - MOV(32, R(ABI_PARAM1), Imm32(R15)); - MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); - CMP(32, R(RSCRATCH), Imm32(addr)); - FixupBranch eq = J_CC(CC_E); - CALL((void*)fault); - SetJumpTarget(eq);*/ - - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) { - ARMv5* cpu5 = (ARMv5*)CurCPU; + MOV(32, R(RSCRATCH3), rnMapped); - // stupid dtcm... 
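// [editor's note] Context for the disabled DTCM case below: Dolphin's emitter
// reaches absolute addresses through RIP-relative operands, which only span
// +-2 GiB around the code buffer, and heap-allocated DTCM can fall outside
// that window. A hypothetical reachability guard, for illustration only (not
// part of this patch):
static bool IsRIPAddressable(const u8* codePtr, const void* target)
{
    ptrdiff_t diff = (const u8*)target - codePtr;
    return diff >= -0x80000000LL && diff < 0x80000000LL;
}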
- if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - // disable this for now as DTCM is located in heap - // which might excced the RIP-addressable range - //region.Mem = cpu5->DTCM; - //region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } + finalAddr = rnMapped.GetSimpleReg(); } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); - if (region.Mem != NULL) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + OpArg rm = MapReg(op2.Reg.Reg); - if (flags & memop_Store) + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) { - MOV(size, M(ptr), MapReg(rd)); + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); } else { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - if (size == 32 && addr & ~0x3) + if (flags & memop_SubtractOffset) { - ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } + else + MOV_sum(32, finalAddr, rnMapped, offset); } - - return; } - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memoryFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); } - X64Reg finalAddr = ABI_PARAM1; - if (flags & memop_Post) - { - MOV(32, R(ABI_PARAM1), rnMapped); + int expectedTarget = Num == 0 + ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + if (CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); - finalAddr = rnMapped.GetSimpleReg(); + switch (expectedTarget) + { + case memregion_MainRAM: + case memregion_DTCM: + case memregion_WRAM7: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_IO9: + case memregion_IO7: + case memregion_VWRAM: + compileFastPath = true; + break; + case memregion_Wifi: + compileFastPath = size >= 16; + break; + case memregion_VRAM: + compileFastPath = !(flags & memop_Store) || size >= 16; + case memregion_BIOS9: + compileFastPath = !(flags & memop_Store); + break; + default: break; } - if (op2.IsImm) + if (addrIsStatic && !compileFastPath) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + compileFastPath = false; + compileSlowPath = true; } - else + + if (addrIsStatic && compileSlowPath) + MOV(32, R(RSCRATCH3), Imm32(staticAddress)); + + if (compileFastPath) { - OpArg rm = MapReg(op2.Reg.Reg); + FixupBranch slowPath; + if (compileSlowPath) + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SHR(32, R(RSCRATCH), Imm8(9)); + if (flags & memop_Store) + { + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); + } + else + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + + slowPath = J_CC(CC_NE, true); + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 + || expectedTarget == memregion_BIOS9) { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + u8* data; + u32 mask; + if (expectedTarget == memregion_MainRAM) + { + data = NDS::MainRAM; + mask = MAIN_RAM_SIZE - 1; + } + else if (expectedTarget == memregion_BIOS9) + { + data = NDS::ARM9BIOS; + mask = 0xFFF; + } + else + { + data = NDS::ARM7WRAM; + mask = 0xFFFF; + } + OpArg memLoc; + if (addrIsStatic) + { + memLoc = M(data + ((staticAddress & mask & addressMask))); + } + else + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm32(mask & addressMask)); + memLoc = MDisp(RSCRATCH, squeezePointer(data)); + } + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - else + else if (expectedTarget == memregion_DTCM) + { + if (addrIsStatic) + MOV(32, R(RSCRATCH), Imm32(staticAddress)); + else + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + } + else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - - if (flags & memop_SubtractOffset) + MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + if (addrIsStatic) { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); + MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); } else - MOV_sum(32, finalAddr, rnMapped, offset); + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm8(addressMask)); + } + AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - } + else + { + u32 maskedDataRegion; - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); + if (addrIsStatic) + { + maskedDataRegion = staticAddress; + MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + AND(32, R(ABI_PARAM1), Imm8(addressMask)); - if (flags & memop_Store) - MOV(32, R(ABI_PARAM2), rdMapped); + maskedDataRegion = CurInstr.DataRegion; + if (Num == 0) + maskedDataRegion &= ~0xFFFFFF; + else + maskedDataRegion &= ~0x7FFFFF; + } - if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) - MOV(32, rdMapped, R(ABI_PARAM1)); + void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); - if (inlinePreparation && size > 8) - AND(32, R(ABI_PARAM1), Imm8(addressMask)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); - CALL(memoryFunc); + ABI_CallFunction((void(*)())func); + } + else + { + if (!addrIsStatic) + MOV(32, rdMapped, R(RSCRATCH3)); - /*if (Num == 0 && check) - { - CMP(32, R(EAX), rdMapped); - FixupBranch notEqual = J_CC(CC_E); - ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); - MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8))); - MOV(32, R(ABI_PARAM2), R(EAX)); - MOV(32, R(ABI_PARAM3), rdMapped); - MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); - CALL((u8*)fault); - ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); - SetJumpTarget(notEqual); - }*/ - - if (!(flags & memop_Store)) - { - if (inlinePreparation && size == 32) + ABI_CallFunction((void(*)())func); + + if (!addrIsStatic) + MOV(32, R(RSCRATCH3), rdMapped); + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if ((size == 32 && !(flags & memop_Store))) { - if (constLocalROR32 == 4) + if (addrIsStatic) + { + if (staticAddress & 0x3) + ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); + } + else { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(ECX), rdMapped); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else if (constLocalROR32 != 0) - ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); } - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + if (compileSlowPath) + { + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + } + + if (compileSlowPath) + { + if (Num == 0) + { + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + } + } + } else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), 
rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + } + } + } + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (compileFastPath && compileSlowPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); } if (!(flags & memop_Store) && rd == 15) @@ -498,100 +433,160 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - IrregularCycles = true; - int regsCount = regs.Count(); s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; +#else u32 stackAlloc = ((regsCount + 1) & ~1) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; - if (!store) + int expectedTarget = Num == 0 + ? ClassifyAddress9(CurInstr.DataRegion) + : ClassifyAddress7(CurInstr.DataRegion); + if (usermode || CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false; + + switch (expectedTarget) { + case memregion_DTCM: + case memregion_MainRAM: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_WRAM7: + compileFastPath = true; + break; + default: + break; + } + + if (!store) Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); - if (decrement) + if (decrement) + { + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(RSCRATCH4), MapReg(rn)); + + if (compileFastPath) + { + assert(!usermode); + + MOV(32, R(RSCRATCH), R(RSCRATCH4)); + SHR(32, R(RSCRATCH), Imm8(9)); + + if (store) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); } else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); - MOV(64, R(ABI_PARAM2), R(RSP)); - - CALL(Num == 0 - ? MemoryFuncsSeq9[0][preinc] - : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + FixupBranch slowPath = J_CC(CC_NE, true); - bool firstUserMode = true; - for (int reg = 15; reg >= 0; reg--) + if (expectedTarget == memregion_DTCM) { - if (regs[reg]) + SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); + LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); + } + else if (expectedTarget == memregion_MainRAM) + { + AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); + } + else if (expectedTarget == memregion_WRAM7) + { + AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); + } + else // SWRAM + { + AND(32, R(RSCRATCH4), Imm8(~3)); + AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + } + u32 offset = 0; + for (int reg : regs) + { + if (preinc) + offset += 4; + OpArg mem = MDisp(RSCRATCH4, offset); + if (store) { - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (RegCache.LoadedRegs & (1 << reg)) { - if (firstUserMode) - { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - firstUserMode = false; - } - MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - POP(RSCRATCH3); - CALL(WriteBanked); - FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); - else - SaveReg(reg, RSCRATCH3); - SetJumpTarget(sucessfulWritten); + MOV(32, mem, MapReg(reg)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else { - assert(reg != 15); - - POP(RSCRATCH); - SaveReg(reg, RSCRATCH); + LoadReg(reg, RSCRATCH); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); } else { - if (reg != 15) - RegCache.DirtyRegs |= (1 << reg); - POP(MapReg(reg).GetSimpleReg()); + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); } } + if (!preinc) + offset += 4; } - if (regsCount & 1) - POP(RSCRATCH); + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + + if (!store) + { + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + if (allocOffset == 0) + MOV(64, R(ABI_PARAM2), R(RSP)); + else + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - if (regs[15]) + switch (Num * 2 | preinc) { - if (Num == 1) - { - if (Thumb) - OR(32, MapReg(15), Imm8(1)); - else - AND(32, MapReg(15), Imm8(0xFE)); - } - Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } - } - else - { - Comp_AddCycles_CD(); - if (regsCount & 1) - PUSH(RSCRATCH); + if (allocOffset) + ADD(64, R(RSP), Imm8(allocOffset)); bool firstUserMode = true; for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -599,43 +594,107 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, RSCRATCH3); - else - MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - CALL(ReadBanked); - PUSH(RSCRATCH3); + POP(RSCRATCH3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); + else + SaveReg(reg, RSCRATCH3); + SetJumpTarget(sucessfulWritten); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else if (!(RegCache.LoadedRegs & (1 << reg))) { - LoadReg(reg, RSCRATCH); - PUSH(RSCRATCH); + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); } else { - PUSH(MapReg(reg).GetSimpleReg()); + POP(MapReg(reg).GetSimpleReg()); } } - - if (decrement) + } + else + { + bool firstUserMode = true; + for (int reg = 15; reg >= 0; reg--) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, RSCRATCH3); + else + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(RSCRATCH3); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + { + PUSH(MapReg(reg).GetSimpleReg()); + } + } } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - MOV(64, R(ABI_PARAM2), R(RSP)); + if (allocOffset) + SUB(64, R(RSP), Imm8(allocOffset)); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + if (allocOffset) + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + else + MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - CALL(Num == 0 - ? MemoryFuncsSeq9[1][preinc] - : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + switch (Num * 2 | preinc) + { + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; + } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); } + if (compileFastPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + return offset; } @@ -786,9 +845,7 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); - else + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 28362d9..b50e821 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -373,16 +373,16 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); - res.NotStrictlyNeeded |= set; + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.DstRegs |= set; } if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + u32 set = (instr & 0xFF); if (res.Kind == tk_PUSH && instr & (1 << 8)) set |= (1 << 14); - res.NotStrictlyNeeded |= set; + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.SrcRegs |= set; } @@ -495,15 +495,15 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.DstRegs |= set; - res.NotStrictlyNeeded |= set; } if (res.Kind == ak_STM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.SrcRegs |= set; - res.NotStrictlyNeeded |= set; } if ((instr >> 28) < 0xE) diff --git a/src/CP15.cpp b/src/CP15.cpp index 62258e9..e665dbd 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -97,6 +97,10 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { +#ifdef JIT_ENABLED + u32 oldDTCMBase = DTCMBase; + u32 oldDTCMSize = DTCMSize; +#endif if (CP15Control & (1<<16)) { DTCMBase = DTCMSetting & 0xFFFFF000; @@ -109,10 +113,20 @@ void ARMv5::UpdateDTCMSetting() DTCMSize = 0; //printf("DTCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + { + ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); + ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + } +#endif } void ARMv5::UpdateITCMSetting() { +#ifdef JIT_ENABLED + u32 oldITCMSize = ITCMSize; +#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -123,6 +137,10 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldITCMSize != ITCMSize) + ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); +#endif } @@ -561,15 +579,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: -#ifdef JIT_ENABLED - ARMJIT::InvalidateAll(); -#endif ICacheInvalidateAll(); return; case 0x751: -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); -#endif ICacheInvalidateByAddr(val); return; case 0x752: @@ -732,7 +744,7 @@ u32 ARMv5::CodeRead32(u32 
addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { @@ -753,7 +765,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -776,7 +788,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -820,14 +832,14 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -844,7 +856,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -853,7 +865,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -870,7 +882,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -879,7 +891,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -903,7 +915,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 141c565..6e989a8 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -535,10 +535,6 @@ void Reset() KeyCnt = 0; RCnt = 0; -#ifdef JIT_ENABLED - ARMJIT::ResetBlockCache(); -#endif - NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); @@ -548,6 +544,10 @@ void Reset() Wifi::Reset(); AREngine::Reset(); + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif } void Stop() @@ -1058,6 +1058,9 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + WRAMCnt = val; switch (WRAMCnt & 0x3) @@ -1090,6 +1093,11 @@ void MapSharedWRAM(u8 val) SWRAM_ARM7Mask = 0x7FFF; break; } + +#ifdef JIT_ENABLED + ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); + ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); +#endif } @@ -1873,12 +1881,18 @@ void ARM9Write8(u32 addr, u8 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1923,12 +1937,18 @@ void ARM9Write16(u32 addr, u16 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1949,7 +1969,12 @@ void ARM9Write16(u32 addr, u16 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); 
return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -1989,12 +2014,18 @@ void ARM9Write32(u32 addr, u32 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return ; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2015,7 +2046,12 @@ void ARM9Write32(u32 addr, u32 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -2279,30 +2315,38 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2312,6 +2356,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2342,30 +2389,38 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2383,6 +2438,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2415,30 +2473,38 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + 
ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2457,6 +2523,9 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; diff --git a/src/NDS.h b/src/NDS.h index c7b455e..163260b 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -120,6 +120,14 @@ extern u8 ROMSeed1[2*8]; extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; +extern u8 SharedWRAM[0x8000]; +extern u8* SWRAM_ARM9; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM9Mask; +extern u32 SWRAM_ARM7Mask; + +extern u8 ARM7WRAM[0x10000]; + #define MAIN_RAM_SIZE 0x400000 extern u8 MainRAM[MAIN_RAM_SIZE]; -- cgit v1.2.3 From 5a0b568647ae3a0d501ca1b915745fe708c9519f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 14:34:52 +0200 Subject: allow allocating caller saved registers currently system-v only --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 19 ++---------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 58 ++++++++++++++++++++++++++----------- src/ARMJIT_x64/ARMJIT_Compiler.h | 3 ++ src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 19 ++++++++++++ 4 files changed, 65 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cac590a..27c24c7 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -138,18 +138,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) bool cpsrDirty = CPSRDirty; SaveCPSR(); - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
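// [editor's note] The subtlety behind "the ugly way": for a conditional ARM
// instruction the branch may be skipped, so the register cache cannot simply
// be flushed; the rest of the block still relies on its mappings. The banked
// R8-R14 are therefore written back but kept resident, then reloaded after
// the call. This commit factors that dance into PushRegs()/PopRegs(); a
// condensed call-site sketch using the patch's own names:
PushRegs(restoreCPSR);        // spill caller-saved regs (+ R8-R14 if needed)
MOV(64, R(ABI_PARAM1), R(RCPU));
MOV(32, R(ABI_PARAM2), R(addr));
CALL((void*)&ARMv4::JumpTo);  // any C++ call clobbers caller-saved host regs
PopRegs(restoreCPSR);         // reload so the mappings stay valid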
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + PushRegs(restoreCPSR); MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), R(addr)); @@ -162,11 +151,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else CALL((void*)&ARMv4::JumpTo); - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } + PopRegs(restoreCPSR); LoadCPSR(); // in case this instruction is skipped diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index eee2e0f..ef04601 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,7 +26,8 @@ const X64Reg RegisterCache::NativeRegAllocOrder[] = #ifdef _WIN32 RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13, R14 // this is sad + RBX, R12, R13, R14, // callee saved, this is sad + R9, R10, R11, // caller saved #endif }; template <> @@ -34,10 +35,46 @@ const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 6 #else - 4 + 7 #endif ; +void Compiler::PushRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + + if (saveHiRegs) + { + BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + { + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.UnloadRegister(reg); + else + SaveReg(reg, RegCache.Mapping[reg]); + // prevent saving the register twice + loadedRegs[reg] = false; + } + } + + for (int reg : loadedRegs) + if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + SaveReg(reg, RegCache.Mapping[reg]); +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + for (int reg : loadedRegs) + { + if ((saveHiRegs && reg >= 8 && reg < 15) + || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + { + LoadReg(reg, RegCache.Mapping[reg]); + } + } +} + void Compiler::A_Comp_MRS() { Comp_AddCycles_C(); @@ -136,27 +173,14 @@ void Compiler::A_Comp_MSR() AND(32, R(RSCRATCH2), val); OR(32, R(RCPSR), R(RSCRATCH2)); - BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } + PushRegs(true); MOV(32, R(ABI_PARAM3), R(RCPSR)); MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); MOV(64, R(ABI_PARAM1), R(RCPU)); CALL((void*)&ARM::UpdateMode); - if (!Thumb && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } + PopRegs(true); } } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9df218b..f2fc301 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -168,6 +168,9 @@ public: Gen::FixupBranch CheckCondition(u32 cond); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Gen::OpArg MapReg(int reg) { if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index c13b779..b27efdd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -283,6 +283,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } else { + PushRegs(false); + u32 maskedDataRegion; if (addrIsStatic) @@ -310,6 +312,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, R(ABI_PARAM2), rdMapped); ABI_CallFunction((void(*)())func); + + PopRegs(false); } else { @@ -318,6 +322,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ABI_CallFunction((void(*)())func); + PopRegs(false); + if (!addrIsStatic) MOV(32, R(RSCRATCH3), rdMapped); @@ -352,6 +358,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (compileSlowPath) { + PushRegs(false); + if (Num == 0) { MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); @@ -402,6 +410,9 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } } } + + PopRegs(false); + if (!(flags & memop_Store)) { if (flags & memop_SignExtend) @@ -561,6 +572,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (!store) { + PushRegs(false); + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); @@ -580,6 +593,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc case 3: CALL((void*)&SlowBlockTransfer7); break; } + PopRegs(false); + if (allocOffset) ADD(64, R(RSP), Imm8(allocOffset)); @@ -655,6 +670,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (allocOffset) SUB(64, R(RSP), Imm8(allocOffset)); + PushRegs(false); + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); if (allocOffset) LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); @@ -674,6 +691,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + + PopRegs(false); } if (compileFastPath) -- cgit v1.2.3 From d91bbec08fe79d8bfb1d3119cd7d03aa12624c82 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 14:36:18 +0200 Subject: use instr hash as key for restore candidates makes Golden Sun burn a little slower through the JIT memory --- src/ARMJIT.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 9602aed..8d87c76 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1042,13 +1042,13 @@ void CompileBlock(ARM* cpu) u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); - JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates.LookUp(instrHash); bool mayRestore = true; if (prevBlock) { - RestoreCandidates.Remove(pseudoPhysicalAddr); + RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1125,6 +1125,7 @@ void CompileBlock(ARM* cpu) void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 512]; u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); @@ -1203,7 +1204,7 @@ void InvalidateByAddr(u32 pseudoPhysical) if (!literalInvalidation) { - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + JitBlock* prevBlock = RestoreCandidates.Insert(block->InstrHash, block); if (prevBlock) delete prevBlock; } -- cgit v1.2.3 From 4cff4b52286a7d1a7e40817d52a5d271a937ddc2 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 15:39:39 +0200 Subject: allow allocating caller saved regs on windows --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ef04601..fd3fb70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -24,7 +24,8 @@ template <> const X64Reg RegisterCache::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13, R14 + RBX, RSI, RDI, R12, R13, R14, // callee saved + R10, R11, // caller saved #else RBX, R12, R13, R14, // callee saved, this is sad R9, R10, R11, // caller saved @@ -33,7 +34,7 @@ const X64Reg RegisterCache::NativeRegAllocOrder[] = template <> const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 - 6 + 8 #else 7 #endif -- cgit v1.2.3 From aa6ff499f98dfbb5ca9aa8cac27fed813684eb45 Mon Sep 17 00:00:00 2001 From: Arisotura Date: Fri, 9 Aug 2019 14:19:13 +0200 Subject: prepare JIT beta branch --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/version.h b/src/version.h index 6250601..9084606 100644 --- a/src/version.h +++ b/src/version.h @@ -19,7 +19,7 @@ #ifndef VERSION_H #define VERSION_H -#define MELONDS_VERSION "0.8.3" +#define MELONDS_VERSION "0.8.3-JIT" #define MELONDS_URL "http://melonds.kuribo64.net/" -- cgit v1.2.3 From c692287ebab4dfdec16bb0a8ce338a4b6fe2d439 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 22 Jun 2019 01:28:32 +0200 Subject: JIT: base all instructions are interpreted --- src/ARM.cpp | 13 +- src/ARMJIT.cpp | 177 ++ 
 src/ARMJIT.h                       |  140 ++
 src/ARMJIT_x64/ARMJIT_Compiler.cpp |  332 ++++
 src/ARMJIT_x64/ARMJIT_Compiler.h   |   54 +
 src/ARM_InstrInfo.cpp              |  376 ++++
 src/ARM_InstrInfo.h                |  232 +++
 src/CMakeLists.txt                 |   12 +
 src/CP15.cpp                       |    7 +
 src/NDS.cpp                        |   17 +
 src/dolphin/Assert.h               |   47 +
 src/dolphin/BitSet.h               |  218 +++
 src/dolphin/CPUDetect.h            |   76 +
 src/dolphin/CodeBlock.h            |  121 ++
 src/dolphin/CommonFuncs.cpp        |   52 +
 src/dolphin/CommonFuncs.h          |   58 +
 src/dolphin/Intrinsics.h           |   72 +
 src/dolphin/Log.h                  |   20 +
 src/dolphin/MemoryUtil.cpp         |  193 ++
 src/dolphin/MemoryUtil.h           |   22 +
 src/dolphin/license_dolphin.txt    |  339 ++++
 src/dolphin/x64ABI.cpp             |  119 ++
 src/dolphin/x64ABI.h               |   57 +
 src/dolphin/x64CPUDetect.cpp       |  274 +++
 src/dolphin/x64Emitter.cpp         | 3398 ++++++++++++++++++++++++++++++++++++
 src/dolphin/x64Emitter.h           | 1180 +++++++++++++
 src/dolphin/x64Reg.h               |   96 +
 27 files changed, 7700 insertions(+), 2 deletions(-)
 create mode 100644 src/ARMJIT.cpp
 create mode 100644 src/ARMJIT.h
 create mode 100644 src/ARMJIT_x64/ARMJIT_Compiler.cpp
 create mode 100644 src/ARMJIT_x64/ARMJIT_Compiler.h
 create mode 100644 src/ARM_InstrInfo.cpp
 create mode 100644 src/ARM_InstrInfo.h
 create mode 100644 src/dolphin/Assert.h
 create mode 100644 src/dolphin/BitSet.h
 create mode 100644 src/dolphin/CPUDetect.h
 create mode 100644 src/dolphin/CodeBlock.h
 create mode 100644 src/dolphin/CommonFuncs.cpp
 create mode 100644 src/dolphin/CommonFuncs.h
 create mode 100644 src/dolphin/Intrinsics.h
 create mode 100644 src/dolphin/Log.h
 create mode 100644 src/dolphin/MemoryUtil.cpp
 create mode 100644 src/dolphin/MemoryUtil.h
 create mode 100644 src/dolphin/license_dolphin.txt
 create mode 100644 src/dolphin/x64ABI.cpp
 create mode 100644 src/dolphin/x64ABI.h
 create mode 100644 src/dolphin/x64CPUDetect.cpp
 create mode 100644 src/dolphin/x64Emitter.cpp
 create mode 100644 src/dolphin/x64Emitter.h
 create mode 100644 src/dolphin/x64Reg.h
(limited to 'src')

diff --git a/src/ARM.cpp b/src/ARM.cpp
index 68cac59..f2b92b4 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -22,6 +22,7 @@
 #include "ARM.h"
 #include "ARMInterpreter.h"
 #include "AREngine.h"
+#include "ARMJIT.h"
 
 
 // instruction timing notes
@@ -524,7 +525,7 @@ void ARMv5::Execute()
 
     while (NDS::ARM9Timestamp < NDS::ARM9Target)
     {
-        if (CPSR & 0x20) // THUMB
+        /*if (CPSR & 0x20) // THUMB
         {
             // prefetch
             R[15] += 2;
@@ -557,7 +558,15 @@
             }
             else
                 AddCycles_C();
-        }
+        }*/
+
+        if (!ARMJIT::IsMapped(Num, R[15] - ((CPSR&0x20)?2:4)))
+            printf("aaarg unmapped region %x\n", R[15]);
+
+        ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(Num, R[15] - ((CPSR&0x20)?2:4));
+        if (block == NULL)
+            block = ARMJIT::CompileBlock(this);
+        Cycles += block(); // TODO optimize this shit!!!
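+        // In short: R[15] runs ahead of the instruction being executed
+        // (by 2 in Thumb state, 4 in ARM state), so subtracting that offset
+        // gives the address of the current instruction. A compiled block is
+        // looked up for that address, compiled on a miss, then executed;
+        // its return value is the number of cycles it consumed.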
if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp new file mode 100644 index 0000000..489cdcf --- /dev/null +++ b/src/ARMJIT.cpp @@ -0,0 +1,177 @@ +#include "ARMJIT.h" + +#include "ARMJIT_x64/ARMJIT_Compiler.h" + +namespace ARMJIT +{ + +Compiler* compiler; +BlockCache cache; + + +#define DUP2(x) x, x + +static ptrdiff_t JIT_MEM[2][32] = { + //arm9 + { + /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), + /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror + /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), + /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), + /* 4X*/ DUP2(-1), + /* 5X*/ DUP2(-1), + /* 6X*/ -1, + offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(-1), + /* 8X*/ DUP2(-1), + /* 9X*/ DUP2(-1), + /* AX*/ DUP2(-1), + /* BX*/ DUP2(-1), + /* CX*/ DUP2(-1), + /* DX*/ DUP2(-1), + /* EX*/ DUP2(-1), + /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) + }, + //arm7 + { + /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), + /* 1X*/ DUP2(-1), + /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), + /* 3X*/ offsetof(BlockCache, SWRAM), + offsetof(BlockCache, ARM7_WRAM), + /* 4X*/ -1, + offsetof(BlockCache, ARM7_WIRAM), + /* 5X*/ DUP2(-1), + /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, + DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(-1), + /* 8X*/ DUP2(-1), + /* 9X*/ DUP2(-1), + /* AX*/ DUP2(-1), + /* BX*/ DUP2(-1), + /* CX*/ DUP2(-1), + /* DX*/ DUP2(-1), + /* EX*/ DUP2(-1), + /* FX*/ DUP2(-1) + } +}; + +static u32 JIT_MASK[2][32] = { + //arm9 + { + /* 0X*/ DUP2(0x00007FFF), + /* 1X*/ DUP2(0x00007FFF), + /* 2X*/ DUP2(0x003FFFFF), + /* 3X*/ DUP2(0x00007FFF), + /* 4X*/ DUP2(0x00000000), + /* 5X*/ DUP2(0x00000000), + /* 6X*/ 0x00000000, + 0x000FFFFF, + /* 7X*/ DUP2(0x00000000), + /* 8X*/ DUP2(0x00000000), + /* 9X*/ DUP2(0x00000000), + /* AX*/ DUP2(0x00000000), + /* BX*/ DUP2(0x00000000), + /* CX*/ DUP2(0x00000000), + /* DX*/ DUP2(0x00000000), + /* EX*/ DUP2(0x00000000), + /* FX*/ DUP2(0x00007FFF) + }, + //arm7 + { + /* 0X*/ DUP2(0x00003FFF), + /* 1X*/ DUP2(0x00000000), + /* 2X*/ DUP2(0x003FFFFF), + /* 3X*/ 0x00007FFF, + 0x0000FFFF, + /* 4X*/ 0x00000000, + 0x0000FFFF, + /* 5X*/ DUP2(0x00000000), + /* 6X*/ DUP2(0x0003FFFF), + /* 7X*/ DUP2(0x00000000), + /* 8X*/ DUP2(0x00000000), + /* 9X*/ DUP2(0x00000000), + /* AX*/ DUP2(0x00000000), + /* BX*/ DUP2(0x00000000), + /* CX*/ DUP2(0x00000000), + /* DX*/ DUP2(0x00000000), + /* EX*/ DUP2(0x00000000), + /* FX*/ DUP2(0x00000000) + } +}; + +#undef DUP2 + + +void Init() +{ + memset(&cache, 0, sizeof(BlockCache)); + + for (int cpu = 0; cpu < 2; cpu++) + for (int i = 0; i < 0x4000; i++) + cache.AddrMapping[cpu][i] = JIT_MEM[cpu][i >> 9] == -1 ? NULL : + (CompiledBlock*)((u8*)&cache + JIT_MEM[cpu][i >> 9]) + + (((i << 14) & JIT_MASK[cpu][i >> 9]) >> 1); + + compiler = new Compiler(); +} + +void DeInit() +{ + delete compiler; +} + +CompiledBlock CompileBlock(ARM* cpu) +{ + bool thumb = cpu->CPSR & 0x20; + + FetchedInstr instrs[12]; + int i = 0; + u32 r15 = cpu->R[15]; + u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + //printf("block %x %d\n", r15, thumb); + do + { + r15 += thumb ? 
2 : 4;
+
+        instrs[i].Instr = nextInstr[0];
+        //printf("%x %x\n", instrs[i].Instr, r15);
+        instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1];
+
+        if (cpu->Num == 0)
+        {
+            ARMv5* cpuv5 = (ARMv5*)cpu;
+            if (thumb && r15 & 0x2)
+            {
+                nextInstr[1] >>= 16;
+                instrs[i].CodeCycles = 0;
+            }
+            else
+            {
+                nextInstr[1] = cpuv5->CodeRead32(r15, false);
+                instrs[i].CodeCycles = cpu->CodeCycles;
+            }
+        }
+        else
+        {
+            ARMv4* cpuv4 = (ARMv4*)cpu;
+            if (thumb)
+                nextInstr[1] = cpuv4->CodeRead16(r15);
+            else
+                nextInstr[1] = cpuv4->CodeRead32(r15);
+            instrs[i].CodeCycles = cpu->CodeCycles;
+        }
+        instrs[i].NextInstr[1] = nextInstr[1];
+        instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr);
+
+        i++;
+    } while(!instrs[i - 1].Info.Branches() && i < 10);
+
+    CompiledBlock block = compiler->CompileBlock(cpu, instrs, i);
+
+    InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block);
+
+    return block;
+}
+
+}
\ No newline at end of file
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
new file mode 100644
index 0000000..d718295
--- /dev/null
+++ b/src/ARMJIT.h
@@ -0,0 +1,140 @@
+#ifndef ARMJIT_H
+#define ARMJIT_H
+
+#include "types.h"
+
+#include 
+
+#include "ARM.h"
+#include "ARM_InstrInfo.h"
+
+namespace ARMJIT
+{
+
+typedef u32 (*CompiledBlock)();
+
+class RegCache
+{
+
+static const int NativeRegAllocOrder[];
+static const int NativeRegsCount;
+
+};
+
+struct FetchedInstr
+{
+    u32 A_Reg(int pos) const
+    {
+        return (Instr >> pos) & 0xF;
+    }
+
+    u32 T_Reg(int pos) const
+    {
+        return (Instr >> pos) & 0x7;
+    }
+
+    u32 Cond() const
+    {
+        return Instr >> 28;
+    }
+
+    u32 Instr;
+    u32 NextInstr[2];
+
+    u8 CodeCycles;
+
+    ARMInstrInfo::Info Info;
+};
+
+/*
+    Copied from DeSmuME
+    Some names were changed to match the nomenclature of melonDS
+
+    Since it's not explained anywhere and it took me some time to understand it,
+    here's a summary of how it works:
+    more or less all memory locations from which code can be executed are
+    represented by an array of function pointers, which point either to null or
+    to a function which executes a block of instructions starting from there.
+
+    The most significant 4 bits of each address are ignored. This 28 bit space is
+    divided into 0x4000 16 KB blocks, each of which holds a pointer to the relevant
+    place inside the aforementioned arrays. Only half of the bytes need to be
+    addressable (ARM addresses are aligned to 4 bytes, Thumb addresses to a 2 byte boundary).
+
+    In case a memory write hits mapped memory, the function block at this
+    address is set to null, so it's recompiled the next time it's executed.
+
+    This method has disadvantages, namely that only a write to the
+    first instruction of a block marks it as invalid and that memory remapping
+    (SWRAM and VRAM) isn't taken into account.
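+
+    As a sketch in terms of the declarations below (the real code is in
+    IsMapped()/LookUpBlock()), resolving an address to a block pointer is
+    just two table lookups:
+
+        CompiledBlock* page = cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14];
+        CompiledBlock block = page ? page[(addr & 0x3FFF) >> 1] : NULL;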
+*/ + +struct BlockCache +{ + CompiledBlock* AddrMapping[2][0x4000] = {0}; + + CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock SWRAM[0x8000/2]; // Shared working RAM + CompiledBlock ARM9_ITCM[0x8000/2]; + CompiledBlock ARM9_LCDC[0xA4000/2]; + CompiledBlock ARM9_BIOS[0x8000/2]; + CompiledBlock ARM7_BIOS[0x4000/2]; + CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM + CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi + CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM +}; + +extern BlockCache cache; + +inline bool IsMapped(u32 num, u32 addr) +{ + return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; +} + +inline CompiledBlock LookUpBlock(u32 num, u32 addr) +{ + return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; +} + +inline void Invalidate16(u32 num, u32 addr) +{ + if (IsMapped(num, addr)) + cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; +} + +inline void Invalidate32(u32 num, u32 addr) +{ + if (IsMapped(num, addr)) + { + CompiledBlock* page = cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; + page[(addr & 0x3FFF) >> 1] = NULL; + page[((addr + 2) & 0x3FFF) >> 1] = NULL; + } +} + +inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) +{ + cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; +} + +inline void ResetBlocks() +{ + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); + memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); + memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); + memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); + memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); + memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); + memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); + memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); + memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); +} + +void Init(); +void DeInit(); + +CompiledBlock CompileBlock(ARM* cpu); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..fb2fda8 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -0,0 +1,332 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" + +#include + +using namespace Gen; + +namespace ARMJIT +{ + +const int RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; +const int RegCache::NativeRegsCount = 5; + +Compiler::Compiler() +{ + AllocCodeSpace(1024 * 1024 * 4); +} + +typedef void (Compiler::*CompileFunc)(); +typedef void (*InterpretFunc)(ARM*); + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + + MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); +} + +void Compiler::SaveCPSR() +{ + if (CPSRDirty) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); + CPSRDirty = false; + } +} + +CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +{ + if (IsAlmostFull()) + { + ResetBlocks(); + ResetCodePtr(); + } + + CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); + + ConstantCycles = 0; + Thumb = cpu->CPSR & 0x20; + Num = cpu->Num; + R15 = cpu->R[15]; + + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + + MOV(64, R(RCPU), ImmPtr(cpu)); + XOR(32, R(RCycles), R(RCycles)); + + LoadCPSR(); + + for (int i = 0; i < instrsCount; i++) + { + R15 += Thumb ? 
2 : 4; + CurrentInstr = instrs[i]; + + CompileFunc comp = NULL; + + if (comp == NULL || i == instrsCount - 1) + { + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + if (i == instrsCount - 1) + { + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + } + + SaveCPSR(); + } + + if (Thumb) + { + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + } + else + { + } + } + else + { + u32 cond = CurrentInstr.Cond(); + if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } + else if (cond == 0xF) + AddCycles_C(); + else + { + FixupBranch skipExecute; + if (cond < 0xE) + { + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + skipExecute = J_CC(CC_Z); + } + else + { + // could have used a LUT, but then where would be the fun? + BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + + skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); + } + + } + + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); + ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + } + else + { + } + + FixupBranch skipFailed; + if (CurrentInstr.Cond() < 0xE) + { + skipFailed = J(); + SetJumpTarget(skipExecute); + + AddCycles_C(); + + SetJumpTarget(skipFailed); + } + } + } + + /* + we don't need to collect the interpreted cycles, + since all functions only add to it, the dispatcher + can take care of it. 
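+
+    (Concretely: interpreter fallbacks add their cycles to the ARM object's
+    Cycles field themselves, while the generated code only returns
+    RCycles + ConstantCycles; the dispatcher then adds that on with
+    `Cycles += block();` in ARMv5::Execute.)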
+ */ + + if (comp == NULL && i != instrsCount - 1) + LoadCPSR(); + } + + SaveCPSR(); + + LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + RET(); + + return res; +} + +void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) +{ + const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = + { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + }; + + const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL + }; +} + +void Compiler::AddCycles_C() +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 
0 : CurrentInstr.CodeCycles); + + if (CurrentInstr.Cond() < 0xE) + ADD(32, R(RCycles), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return R(rm); + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), R(rm)); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), R(rm)); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + { + MOV(32, R(RSCRATCH2), R(rm)); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + else + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + ROR_(32, R(RSCRATCH), Imm8(amount)); + } + else + { + BT(32, R(RCPSR), Imm8(29)); + MOV(32, R(RSCRATCH), R(rm)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } +} + +void Compiler::A_Comp_ALU(const FetchedInstr& instr) +{ +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h new file mode 100644 index 0000000..8e1d100 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -0,0 +1,54 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../dolphin/x64Emitter.h" + +#include "../ARMJIT.h" + + +namespace ARMJIT +{ + +const Gen::X64Reg RCPU = Gen::RBP; +const Gen::X64Reg RCycles = Gen::R14; +const Gen::X64Reg RCPSR = Gen::R15; + +const Gen::X64Reg RSCRATCH = Gen::EAX; +const Gen::X64Reg RSCRATCH2 = Gen::EDX; +const Gen::X64Reg RSCRATCH3 = Gen::ECX; + +class Compiler : public Gen::X64CodeBlock +{ +public: + Compiler(); + + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + + void StartBlock(ARM* cpu); + CompiledBlock FinaliseBlock(); + + void Compile(RegCache& regs, const FetchedInstr& instr); +private: + void AddCycles_C(); + + Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed); + + void A_Comp_ALU(const FetchedInstr& instr); + + void LoadCPSR(); + void SaveCPSR(); + + bool CPSRDirty = false; + + FetchedInstr CurrentInstr; + + bool Thumb; + u32 Num; + u32 R15; + + u32 ConstantCycles; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp new file mode 100644 index 0000000..41c46e1 --- /dev/null +++ b/src/ARM_InstrInfo.cpp @@ -0,0 +1,376 @@ +#include "ARM_InstrInfo.h" + +#include + +namespace ARMInstrInfo +{ + +#define ak(x) ((x) << 13) + +enum { + A_Read0 = 1 << 0, + A_Read16 = 1 << 1, + A_Read8 = 1 << 2, + A_Read12 = 1 << 3, + + A_Write12 = 1 << 4, + A_Write16 = 1 << 5, + A_MemWriteback = 1 << 6, + + A_BranchAlways = 1 << 7, + + // for STRD/LDRD + A_Read12Double = 1 << 8, + A_Write12Double = 1 << 9, + + A_Link = 1 << 10, + + A_LDMSTM = 1 << 11, + + A_ARM9Only = 1 << 12, +}; + +#define A_BIOP A_Read16 +#define A_MONOOP 0 + +#define A_IMPLEMENT_ALU_OP(x,k) \ + const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | 
A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ + \ + const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP) +A_IMPLEMENT_ALU_OP(EOR,BIOP) +A_IMPLEMENT_ALU_OP(SUB,BIOP) +A_IMPLEMENT_ALU_OP(RSB,BIOP) +A_IMPLEMENT_ALU_OP(ADD,BIOP) +A_IMPLEMENT_ALU_OP(ADC,BIOP) +A_IMPLEMENT_ALU_OP(SBC,BIOP) +A_IMPLEMENT_ALU_OP(RSC,BIOP) +A_IMPLEMENT_ALU_OP(ORR,BIOP) +A_IMPLEMENT_ALU_OP(MOV,MONOOP) +A_IMPLEMENT_ALU_OP(BIC,BIOP) +A_IMPLEMENT_ALU_OP(MVN,MONOOP) + +const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; + +#define A_IMPLEMENT_ALU_TEST(x) \ + const u32 A_##x##_IMM = A_Read16 | A_Read0 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST) +A_IMPLEMENT_ALU_TEST(TEQ) +A_IMPLEMENT_ALU_TEST(CMP) +A_IMPLEMENT_ALU_TEST(CMN) + +const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAWy = 
A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); +const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); +const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); +const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); + +const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ); + +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB); + +#define A_LDR A_Write12 +#define A_STR A_Read12 + +#define A_IMPLEMENT_WB_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ + const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSR); \ + const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ + const u32 A_##x##_REG_ROR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ + const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ + const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ + const u32 A_##x##_POST_REG_ROR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + +A_IMPLEMENT_WB_LDRSTR(STR,STR) +A_IMPLEMENT_WB_LDRSTR(STRB,STR) +A_IMPLEMENT_WB_LDRSTR(LDR,LDR) +A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) + +#define A_LDRD A_Write12Double +#define A_STRD A_Read12Double + +#define A_IMPLEMENT_HD_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_REG); \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG); + +A_IMPLEMENT_HD_LDRSTR(STRH,STR) +A_IMPLEMENT_HD_LDRSTR(LDRD,LDRD) +A_IMPLEMENT_HD_LDRSTR(STRD,STRD) +A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) + +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); + +const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM); + +const u32 A_B = A_BranchAlways | ak(ak_B); +const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); +const u32 A_BLX_IMM = A_BranchAlways | A_Link | ak(ak_BLX_IMM); +const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); +const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); + +const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); +const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS); +const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR); +const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC); +const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); + +// THUMB + +#define tk(x) ((x) << 16) + +enum { + T_Read0 
= 1 << 0, + T_Read3 = 1 << 1, + T_Read6 = 1 << 2, + T_Read8 = 1 << 3, + + T_Write0 = 1 << 4, + T_Write8 = 1 << 5, + + T_ReadHi0 = 1 << 6, + T_ReadHi3 = 1 << 7, + T_WriteHi0 = 1 << 8, + + T_ReadR13 = 1 << 9, + T_WriteR13 = 1 << 10, + T_ReadR15 = 1 << 11, + + T_BranchAlways = 1 << 12, + T_ReadR14 = 1 << 13, + T_WriteR14 = 1 << 14, + + T_PopPC = 1 << 15 +}; + +const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_Write8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG); + +const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); +const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); + +const u32 T_ADD_PCREL = T_Write8 | T_ReadR15 | tk(tk_ADD_PCREL); +const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); +const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); + +const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); + +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); +const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); + +const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); + +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | 
tk(tk_STR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); + +const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); + +const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); + +const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); +const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); +const u32 T_BLX_REG = T_BranchAlways | T_ReadR15 | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_B = T_BranchAlways | tk(tk_B); +const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); + +const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | T_ReadR15 | tk(tk_SVC); + +#define INSTRFUNC_PROTO(x) u32 x +#include "ARM_InstrTable.h" +#undef INSTRFUNC_PROTO + +Info Decode(bool thumb, u32 num, u32 instr) +{ + Info res = {0}; + if (thumb) + { + u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + + if (data & T_Read0) + res.SrcRegs |= 1 << (instr & 0x7); + if (data & T_Read3) + res.SrcRegs |= 1 << ((instr >> 3) & 0x7); + if (data & T_Read6) + res.SrcRegs |= 1 << ((instr >> 6) & 0x7); + if (data & T_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_Write0) + res.DstRegs |= 1 << (instr & 0x7); + if (data & T_Write8) + res.DstRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_ReadHi0) + res.SrcRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + if (data & T_ReadHi3) + res.SrcRegs |= 1 << ((instr >> 3) & 0xF); + if (data & T_WriteHi0) + res.DstRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + + if (data & T_ReadR13) + res.SrcRegs |= (1 << 13); + if (data & T_WriteR13) + res.DstRegs |= (1 << 13); + if (data & T_ReadR15) + res.SrcRegs |= (1 << 15); + + if (data & T_BranchAlways) + res.DstRegs |= (1 << 15); + + if (data & T_PopPC && instr & (1 << 8)) + res.DstRegs |= 1 << 15; + + res.Kind = (data >> 16) & 0x3F; + + return res; + } + else + { + u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; + if ((instr & 0xFE000000) == 0xFA000000) + data = A_BLX_IMM; + + if (data & A_ARM9Only && num != 0) + data |= A_BranchAlways | A_Link; + + if (data & A_Read0) + res.SrcRegs |= 1 << (instr & 0xF); + if (data & A_Read16) + res.SrcRegs |= 1 << ((instr >> 16) & 0xF); + if (data & A_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0xF); + if (data & A_Read12) + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + + if (data & A_Write12) + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + if (data & A_Write16) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_MemWriteback && instr & (1 << 21)) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_BranchAlways) + res.DstRegs |= 1 << 15; + + if (data & A_Read12Double) + { + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + res.SrcRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + if (data & A_Write12Double) + { + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + res.DstRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + + if (data & A_Link) + { + res.DstRegs |= 1 << 14; + res.SrcRegs |= 1 << 15; + } + + if (data & A_LDMSTM) + { + res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15); + if (instr & (1 << 21)) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + } + + res.Kind = (data >> 13) & 0x1FF; + + return res; + } +} + +} diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h new file mode 100644 index 0000000..e717664 --- 
/dev/null +++ b/src/ARM_InstrInfo.h @@ -0,0 +1,232 @@ +#ifndef ARMINSTRINFO_H +#define ARMINSTRINFO_H + +#include "types.h" + +namespace ARMInstrInfo +{ + +// Instruction kinds, for faster dispatch + +#define ak_ALU(n) \ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_REG_LSL_IMM_S, \ + ak_##n##_REG_LSR_IMM_S, \ + ak_##n##_REG_ASR_IMM_S, \ + ak_##n##_REG_ROR_IMM_S, \ + \ + ak_##n##_REG_LSL_REG_S, \ + ak_##n##_REG_LSR_REG_S, \ + ak_##n##_REG_ASR_REG_S, \ + ak_##n##_REG_ROR_REG_S, \ + \ + ak_##n##_IMM_S \ + +#define ak_Test(n) \ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM + +#define ak_WB_LDRSTR(n) \ + ak_##n##_REG_LSL, \ + ak_##n##_REG_LSR, \ + ak_##n##_REG_ASR, \ + ak_##n##_REG_ROR, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG_LSL, \ + ak_##n##_POST_REG_LSR, \ + ak_##n##_POST_REG_ASR, \ + ak_##n##_POST_REG_ROR, \ + \ + ak_##n##_POST_IMM + +#define ak_HD_LDRSTR(n) \ + ak_##n##_REG, \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG, \ + ak_##n##_POST_IMM + +enum +{ + ak_ALU(AND), + ak_ALU(EOR), + ak_ALU(SUB), + ak_ALU(RSB), + ak_ALU(ADD), + ak_ALU(ADC), + ak_ALU(SBC), + ak_ALU(RSC), + ak_ALU(ORR), + ak_ALU(MOV), + ak_ALU(BIC), + ak_ALU(MVN), + + ak_ALU(TST), + ak_ALU(TEQ), + ak_ALU(CMP), + ak_ALU(CMN), + + ak_MUL, + ak_MLA, + ak_UMULL, + ak_UMLAL, + ak_SMULL, + ak_SMLAL, + ak_SMLAxy, + ak_SMLAWy, + ak_SMULWy, + ak_SMLALxy, + ak_SMULxy, + + ak_CLZ, + + ak_QADD, + ak_QSUB, + ak_QDADD, + ak_QDSUB, + + ak_WB_LDRSTR(STR), + ak_WB_LDRSTR(STRB), + ak_WB_LDRSTR(LDR), + ak_WB_LDRSTR(LDRB), + + ak_HD_LDRSTR(STRH), + ak_HD_LDRSTR(LDRD), + ak_HD_LDRSTR(STRD), + ak_HD_LDRSTR(LDRH), + ak_HD_LDRSTR(LDRSB), + ak_HD_LDRSTR(LDRSH), + + ak_SWP, + ak_SWPB, + + ak_LDM, + ak_STM, + + ak_B, + ak_BL, + ak_BLX_IMM, + ak_BX, + ak_BLX_REG, + + ak_UNK, + ak_MSR_IMM, + ak_MSR_REG, + ak_MRS, + ak_MCR, + ak_MRC, + ak_SVC, + + ak_Count, + + tk_LSL_IMM = 0, + tk_LSR_IMM, + tk_ASR_IMM, + + tk_ADD_REG_, + tk_SUB_REG_, + tk_ADD_IMM_, + tk_SUB_IMM_, + + tk_MOV_IMM, + tk_CMP_IMM, + tk_ADD_IMM, + tk_SUB_IMM, + + tk_AND_REG, + tk_EOR_REG, + tk_LSL_REG, + tk_LSR_REG, + tk_ASR_REG, + tk_ADC_REG, + tk_SBC_REG, + tk_ROR_REG, + tk_TST_REG, + tk_NEG_REG, + tk_CMP_REG, + tk_CMN_REG, + tk_ORR_REG, + tk_MUL_REG, + tk_BIC_REG, + tk_MVN_REG, + + tk_ADD_HIREG, + tk_CMP_HIREG, + tk_MOV_HIREG, + + tk_ADD_PCREL, + tk_ADD_SPREL, + tk_ADD_SP, + + tk_LDR_PCREL, + tk_STR_REG, + tk_STRB_REG, + tk_LDR_REG, + tk_LDRB_REG, + tk_STRH_REG, + tk_LDRSB_REG, + tk_LDRH_REG, + tk_LDRSH_REG, + tk_STR_IMM, + tk_LDR_IMM, + tk_STRB_IMM, + tk_LDRB_IMM, + tk_STRH_IMM, + tk_LDRH_IMM, + tk_STR_SPREL, + tk_LDR_SPREL, + + tk_PUSH, + tk_POP, + tk_LDMIA, + tk_STMIA, + tk_BCOND, + tk_BX, + tk_BLX_REG, + tk_B, + tk_BL_LONG_1, + tk_BL_LONG_2, + tk_UNK, + tk_SVC, + + tk_Count +}; + +struct Info +{ + u16 DstRegs, SrcRegs; + u16 Kind; + + bool Branches() + { + return DstRegs & (1 << 15); + } +}; + +Info Decode(bool thumb, u32 num, u32 instr); + +} + +#endif \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 32fcac2..a6011e1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,7 @@ project(core) +set (CMAKE_CXX_STANDARD 14) + add_library(core STATIC 
ARCodeList.cpp AREngine.cpp @@ -9,6 +11,7 @@ add_library(core STATIC ARMInterpreter_ALU.cpp ARMInterpreter_Branch.cpp ARMInterpreter_LoadStore.cpp + ARM_InstrInfo.cpp Config.cpp CP15.cpp CRC32.cpp @@ -46,6 +49,15 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c + + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + dolphin/MemoryUtil.cpp ) if (WIN32) diff --git a/src/CP15.cpp b/src/CP15.cpp index d340b9e..3e1c08b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -21,6 +21,7 @@ #include "NDS.h" #include "DSi.h" #include "ARM.h" +#include "ARMJIT.h" // access timing for cached regions @@ -812,6 +813,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -833,6 +835,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -854,6 +857,8 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -875,6 +880,8 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/NDS.cpp b/src/NDS.cpp index 22368ae..2a7edfd 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -32,6 +32,7 @@ #include "Wifi.h" #include "AREngine.h" #include "Platform.h" +#include "ARMJIT.h" #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -168,6 +169,8 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); + ARMJIT::Init(); + DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); DMAs[2] = new DMA(0, 2); @@ -200,6 +203,8 @@ void DeInit() delete ARM9; delete ARM7; + ARMJIT::DeInit(); + for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -1971,6 +1976,8 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { + ARMJIT::Invalidate16(0, addr); + switch (addr & 0xFF000000) { case 0x02000000: @@ -2021,6 +2028,8 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { + ARMJIT::Invalidate16(0, addr); + switch (addr & 0xFF000000) { case 0x02000000: @@ -2087,6 +2096,8 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { + ARMJIT::Invalidate32(0, addr); + switch (addr & 0xFF000000) { case 0x02000000: @@ -2381,6 +2392,8 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { + ARMJIT::Invalidate16(1, addr); + switch (addr & 0xFF800000) { case 0x02000000: @@ -2440,6 +2453,8 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { + ARMJIT::Invalidate16(1, addr); + switch (addr & 0xFF800000) { case 0x02000000: @@ -2509,6 +2524,8 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { + ARMJIT::Invalidate32(1, addr); + switch (addr & 0xFF800000) { case 0x02000000: diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h new file mode 100644 index 0000000..4eb16e0 --- /dev/null +++ b/src/dolphin/Assert.h @@ -0,0 +1,47 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ 
+// Refer to the license_dolphin.txt file included. + +#pragma once + +#include + +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ diff --git a/src/dolphin/BitSet.h b/src/dolphin/BitSet.h new file mode 100644 index 0000000..d32b020 --- /dev/null +++ b/src/dolphin/BitSet.h @@ -0,0 +1,218 @@ +// This file is under the public domain. + +#pragma once + +#include +#include +#include +#include "../types.h" + +#ifdef _WIN32 + +#include + +namespace Common +{ +template +constexpr int CountSetBits(T v) +{ + // from https://graphics.stanford.edu/~seander/bithacks.html + // GCC has this built in, but MSVC's intrinsic will only emit the actual + // POPCNT instruction, which we're not depending on + v = v - ((v >> 1) & (T) ~(T)0 / 3); + v = (v & (T) ~(T)0 / 15 * 3) + ((v >> 2) & (T) ~(T)0 / 15 * 3); + v = (v + (v >> 4)) & (T) ~(T)0 / 255 * 15; + return (T)(v * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * 8; +} +inline int LeastSignificantSetBit(u8 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u16 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u32 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u64 val) +{ + unsigned long index; + _BitScanForward64(&index, val); + return (int)index; +} +#else +namespace Common +{ +constexpr int CountSetBits(u8 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u16 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u32 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u64 val) +{ + return __builtin_popcountll(val); +} +inline int LeastSignificantSetBit(u8 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u16 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u32 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u64 val) +{ + return __builtin_ctzll(val); +} +#endif + +// Similar to std::bitset, this is a class which encapsulates a bitset, i.e. +// using the set bits of an integer to represent a set of integers. Like that +// class, it acts like an array of bools: +// BitSet32 bs; +// bs[1] = true; +// but also like the underlying integer ([0] = least significant bit): +// BitSet32 bs2 = ...; +// bs = (bs ^ bs2) & BitSet32(0xffff); +// The following additional functionality is provided: +// - Construction using an initializer list. +// BitSet bs { 1, 2, 4, 8 }; +// - Efficiently iterating through the set bits: +// for (int i : bs) +// [i is the *index* of a set bit] +// (This uses the appropriate CPU instruction to find the next set bit in one +// operation.) 
+// - Counting set bits using .Count() - see comment on that method.
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+template <typename IntTy>
+class BitSet
+{
+  static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+
+public:
+  // A reference to a particular bit, returned from operator[].
+  class Ref
+  {
+  public:
+    constexpr Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+    constexpr Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+    constexpr operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+    bool operator=(bool set)
+    {
+      m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+      return set;
+    }
+
+  private:
+    BitSet* m_bs;
+    IntTy m_mask;
+  };
+
+  // An STL-like iterator is required to be able to use range-based for loops.
+  class Iterator
+  {
+  public:
+    constexpr Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+    constexpr Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+    Iterator& operator=(Iterator other)
+    {
+      new (this) Iterator(other);
+      return *this;
+    }
+    Iterator& operator++()
+    {
+      if (m_val == 0)
+      {
+        m_bit = -1;
+      }
+      else
+      {
+        int bit = LeastSignificantSetBit(m_val);
+        m_val &= ~((IntTy)1 << bit);
+        m_bit = bit;
+      }
+      return *this;
+    }
+    Iterator operator++(int)
+    {
+      Iterator other(*this);
+      ++*this;
+      return other;
+    }
+    constexpr int operator*() const { return m_bit; }
+    constexpr bool operator==(Iterator other) const { return m_bit == other.m_bit; }
+    constexpr bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+
+  private:
+    IntTy m_val;
+    int m_bit;
+  };
+
+  constexpr BitSet() : m_val(0) {}
+  constexpr explicit BitSet(IntTy val) : m_val(val) {}
+  BitSet(std::initializer_list<int> init)
+  {
+    m_val = 0;
+    for (int bit : init)
+      m_val |= (IntTy)1 << bit;
+  }
+
+  constexpr static BitSet AllTrue(size_t count)
+  {
+    return BitSet(count == sizeof(IntTy) * 8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
+  }
+
+  Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+  constexpr const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+  constexpr bool operator==(BitSet other) const { return m_val == other.m_val; }
+  constexpr bool operator!=(BitSet other) const { return m_val != other.m_val; }
+  constexpr bool operator<(BitSet other) const { return m_val < other.m_val; }
+  constexpr bool operator>(BitSet other) const { return m_val > other.m_val; }
+  constexpr BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+  constexpr BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+  constexpr BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+  constexpr BitSet operator~() const { return BitSet(~m_val); }
+  constexpr BitSet operator<<(IntTy shift) const { return BitSet(m_val << shift); }
+  constexpr BitSet operator>>(IntTy shift) const { return BitSet(m_val >> shift); }
+  constexpr explicit operator bool() const { return m_val != 0; }
+  BitSet& operator|=(BitSet other) { return *this = *this | other; }
+  BitSet& operator&=(BitSet other) { return *this = *this & other; }
+  BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+  BitSet& operator<<=(IntTy shift) { return *this = *this << shift; }
+  BitSet& operator>>=(IntTy shift) { return *this = *this >> shift; }
+  // Warning: Even though on modern CPUs this is a single fast instruction,
+  // Dolphin's official builds do not currently assume POPCNT support on x86,
+  // so slower explicit bit twiddling is generated. Still should generally
+  // be faster than a loop.
+  constexpr unsigned int Count() const { return CountSetBits(m_val); }
+  constexpr Iterator begin() const { return ++Iterator(m_val, 0); }
+  constexpr Iterator end() const { return Iterator(m_val, -1); }
+  IntTy m_val;
+};
+}  // namespace Common
+
+using BitSet8 = Common::BitSet<u8>;
+using BitSet16 = Common::BitSet<u16>;
+using BitSet32 = Common::BitSet<u32>;
+using BitSet64 = Common::BitSet<u64>;
diff --git a/src/dolphin/CPUDetect.h b/src/dolphin/CPUDetect.h
new file mode 100644
index 0000000..bd4fd8d
--- /dev/null
+++ b/src/dolphin/CPUDetect.h
@@ -0,0 +1,76 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+// Detect the CPU, so we'll know which optimizations to use
+#pragma once
+
+#include <string>
+
+enum class CPUVendor
+{
+  Intel,
+  AMD,
+  ARM,
+  Other,
+};
+
+struct CPUInfo
+{
+  CPUVendor vendor = CPUVendor::Intel;
+
+  char cpu_string[0x41] = {};
+  char brand_string[0x21] = {};
+  bool OS64bit = false;
+  bool CPU64bit = false;
+  bool Mode64bit = false;
+
+  bool HTT = false;
+  int num_cores = 0;
+  int logical_cpu_count = 0;
+
+  bool bSSE = false;
+  bool bSSE2 = false;
+  bool bSSE3 = false;
+  bool bSSSE3 = false;
+  bool bPOPCNT = false;
+  bool bSSE4_1 = false;
+  bool bSSE4_2 = false;
+  bool bLZCNT = false;
+  bool bSSE4A = false;
+  bool bAVX = false;
+  bool bAVX2 = false;
+  bool bBMI1 = false;
+  bool bBMI2 = false;
+  bool bFMA = false;
+  bool bFMA4 = false;
+  bool bAES = false;
+  // FXSAVE/FXRSTOR
+  bool bFXSR = false;
+  bool bMOVBE = false;
+  // This flag indicates that the hardware supports some mode
+  // in which denormal inputs _and_ outputs are automatically set to (signed) zero.
+  bool bFlushToZero = false;
+  bool bLAHFSAHF64 = false;
+  bool bLongMode = false;
+  bool bAtom = false;
+
+  // ARMv8 specific
+  bool bFP = false;
+  bool bASIMD = false;
+  bool bCRC32 = false;
+  bool bSHA1 = false;
+  bool bSHA2 = false;
+
+  // Call Detect()
+  explicit CPUInfo();
+
+  // Turn the CPU info into a string we can show
+  std::string Summarize();
+
+private:
+  // Detects the various CPU features
+  void Detect();
+};
+
+extern CPUInfo cpu_info;
diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h
new file mode 100644
index 0000000..1434297
--- /dev/null
+++ b/src/dolphin/CodeBlock.h
@@ -0,0 +1,121 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include "Assert.h"
+#include "../types.h"
+#include "MemoryUtil.h"
+
+namespace Common
+{
+// Everything that needs to generate code should inherit from this.
+// You get memory management for free, plus, you can use all emitter functions without
+// having to prefix them with gen-> or something similar.
+// Example implementation:
+// class JIT : public CodeBlock<XEmitter> {}
+template <typename T>
+class CodeBlock : public T
+{
+private:
+  // A privately used function to set the executable RAM space to something invalid.
+  // For debugging usefulness it should be used to set the RAM to a host specific breakpoint
+  // instruction
+  virtual void PoisonMemory() = 0;
+
+protected:
+  u8* region = nullptr;
+  // Size of region we can use.
+  size_t region_size = 0;
+  // Original size of the region we allocated.
+  size_t total_region_size = 0;
+
+  bool m_is_child = false;
+  std::vector<CodeBlock*> m_children;
+
+public:
+  CodeBlock() = default;
+  virtual ~CodeBlock()
+  {
+    if (region)
+      FreeCodeSpace();
+  }
+  CodeBlock(const CodeBlock&) = delete;
+  CodeBlock& operator=(const CodeBlock&) = delete;
+  CodeBlock(CodeBlock&&) = delete;
+  CodeBlock& operator=(CodeBlock&&) = delete;
+
+  // Call this before you generate any code.
+  void AllocCodeSpace(size_t size)
+  {
+    region_size = size;
+    total_region_size = size;
+    region = static_cast<u8*>(Common::AllocateExecutableMemory(total_region_size));
+    T::SetCodePtr(region);
+  }
+
+  // Always clear code space with breakpoints, so that if someone accidentally executes
+  // uninitialized, it just breaks into the debugger.
+  void ClearCodeSpace()
+  {
+    PoisonMemory();
+    ResetCodePtr();
+  }
+
+  // Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
+  void FreeCodeSpace()
+  {
+    ASSERT(!m_is_child);
+    Common::FreeMemoryPages(region, total_region_size);
+    region = nullptr;
+    region_size = 0;
+    total_region_size = 0;
+    for (CodeBlock* child : m_children)
+    {
+      child->region = nullptr;
+      child->region_size = 0;
+      child->total_region_size = 0;
+    }
+  }
+
+  bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); }
+  // Cannot currently be undone. Will write protect the entire code region.
+  // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
+  void WriteProtect() { Common::WriteProtectMemory(region, region_size, true); }
+  void ResetCodePtr() { T::SetCodePtr(region); }
+  size_t GetSpaceLeft() const
+  {
+    ASSERT(static_cast<size_t>(T::GetCodePtr() - region) < region_size);
+    return region_size - (T::GetCodePtr() - region);
+  }
+
+  bool IsAlmostFull() const
+  {
+    // This should be bigger than the biggest block ever.
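+    // (0x10000 = 64 KiB of headroom; once less than that remains, the owner
+    // is expected to ClearCodeSpace() and recompile rather than risk writing
+    // past the end of the region.)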
+    return GetSpaceLeft() < 0x10000;
+  }
+
+  bool HasChildren() const { return region_size != total_region_size; }
+  u8* AllocChildCodeSpace(size_t child_size)
+  {
+    ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation.");
+    u8* child_region = region + region_size - child_size;
+    region_size -= child_size;
+    return child_region;
+  }
+  void AddChildCodeSpace(CodeBlock* child, size_t child_size)
+  {
+    u8* child_region = AllocChildCodeSpace(child_size);
+    child->m_is_child = true;
+    child->region = child_region;
+    child->region_size = child_size;
+    child->total_region_size = child_size;
+    child->ResetCodePtr();
+    m_children.emplace_back(child);
+  }
+};
+}  // namespace Common
diff --git a/src/dolphin/CommonFuncs.cpp b/src/dolphin/CommonFuncs.cpp
new file mode 100644
index 0000000..f85051d
--- /dev/null
+++ b/src/dolphin/CommonFuncs.cpp
@@ -0,0 +1,52 @@
+// Copyright 2009 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstddef>
+#include <cstring>
+#include <errno.h>
+#include <type_traits>
+
+#include "CommonFuncs.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define strerror_r(err, buf, len) strerror_s(buf, len, err)
+#endif
+
+constexpr size_t BUFFER_SIZE = 256;
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString()
+{
+  char error_message[BUFFER_SIZE];
+
+  // There are two variants of strerror_r. The XSI version stores the message to the passed-in
+  // buffer and returns an int (0 on success). The GNU version returns a pointer to the message,
+  // which might have been stored in the passed-in buffer or might be a static string.
+
+  // We check defines in order to figure out which variant is in use, and we store the returned
+  // value to a variable so that we'll get a compile-time check that our assumption was correct.
+
+#if defined(__GLIBC__) && (_GNU_SOURCE || (_POSIX_C_SOURCE < 200112L && _XOPEN_SOURCE < 600))
+  const char* str = strerror_r(errno, error_message, BUFFER_SIZE);
+  return std::string(str);
+#else
+  int error_code = strerror_r(errno, error_message, BUFFER_SIZE);
+  return error_code == 0 ? std::string(error_message) : "";
+#endif
+}
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code.
+std::string GetLastErrorString()
+{
+  char error_message[BUFFER_SIZE];
+
+  FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, GetLastError(),
+                 MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), error_message, BUFFER_SIZE, nullptr);
+  return std::string(error_message);
+}
+#endif
diff --git a/src/dolphin/CommonFuncs.h b/src/dolphin/CommonFuncs.h
new file mode 100644
index 0000000..708fbc3
--- /dev/null
+++ b/src/dolphin/CommonFuncs.h
@@ -0,0 +1,58 @@
+// Copyright 2009 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
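+
+// Usage sketch for the error-string helpers declared below (illustrative
+// only; the call site is hypothetical):
+//
+//   FILE* f = fopen(path, "rb");
+//   if (!f)
+//     printf("open failed: %s\n", LastStrerrorString().c_str());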
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include "../types.h"
+
+// Will fail to compile on a non-array:
+template <typename T, size_t N>
+constexpr size_t ArraySize(T (&arr)[N])
+{
+  return N;
+}
+
+#ifndef _WIN32
+
+// go to debugger mode
+#define Crash() \
+  { \
+    __builtin_trap(); \
+  }
+
+#else  // WIN32
+// Function Cross-Compatibility
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define unlink _unlink
+#define vscprintf _vscprintf
+
+// 64 bit offsets for Windows
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#define atoll _atoi64
+#define stat _stat64
+#define fstat _fstat64
+#define fileno _fileno
+
+extern "C" {
+__declspec(dllimport) void __stdcall DebugBreak(void);
+}
+#define Crash() \
+  { \
+    DebugBreak(); \
+  }
+#endif  // WIN32 ndef
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString();
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code.
+std::string GetLastErrorString();
+#endif
diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h
new file mode 100644
index 0000000..483f219
--- /dev/null
+++ b/src/dolphin/Intrinsics.h
@@ -0,0 +1,72 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#if defined(_M_X86)
+
+/**
+ * It is assumed that all compilers used to build Dolphin support intrinsics up to and including
+ * SSE 4.2 on x86/x64.
+ */
+
+#if defined(__GNUC__) || defined(__clang__)
+
+/**
+ * Due to limitations in GCC, SSE intrinsics are only available when compiling with the
+ * corresponding instruction set enabled. However, using the target attribute, we can compile
+ * single functions with a different target instruction set, while still creating a generic build.
+ *
+ * Since this instruction set is enabled per-function, any callers should verify that the
+ * instruction set is supported at runtime before calling it, and provide a fallback implementation
+ * when not supported.
+ *
+ * When building with -march=native, or enabling the instruction sets in the compile flags, permit
+ * usage of the intrinsics without any function attributes. If the command-line architecture does
+ * not support this instruction set, enable it via function targeting.
+ */
+
+#include <x86intrin.h>
+#ifndef __SSE4_2__
+#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]]
+#endif
+#ifndef __SSE4_1__
+#define FUNCTION_TARGET_SSE41 [[gnu::target("sse4.1")]]
+#endif
+#ifndef __SSSE3__
+#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]]
+#endif
+#ifndef __SSE3__
+#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]]
+#endif
+
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+
+/**
+ * MSVC and ICC support intrinsics for any instruction set without any function attributes.
+ */
+#include <intrin.h>
+
+#endif  // defined(_MSC_VER) || defined(__INTEL_COMPILER)
+
+#endif  // _M_X86
+
+/**
+ * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform.
+ * This way when a function is defined with FUNCTION_TARGET you don't need to define a second
+ * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use
+ * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures.
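+ *
+ * Example (illustrative only; CRC32Step is a hypothetical caller, and the
+ * runtime check against cpu_info is the caller's responsibility):
+ *
+ *   FUNCTION_TARGET_SSE42
+ *   u32 CRC32Step(u32 crc, u8 byte)
+ *   {
+ *     return _mm_crc32_u8(crc, byte);  // only call if cpu_info.bSSE4_2 is set
+ *   }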
+ */
+#ifndef FUNCTION_TARGET_SSE42
+#define FUNCTION_TARGET_SSE42
+#endif
+#ifndef FUNCTION_TARGET_SSE41
+#define FUNCTION_TARGET_SSE41
+#endif
+#ifndef FUNCTION_TARGET_SSSE3
+#define FUNCTION_TARGET_SSSE3
+#endif
+#ifndef FUNCTION_TARGET_SSE3
+#define FUNCTION_TARGET_SSE3
+#endif
diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h
new file mode 100644
index 0000000..21e69a5
--- /dev/null
+++ b/src/dolphin/Log.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "CommonFuncs.h"
+
+#include <stdio.h>
+
+#define PanicAlert(msg) \
+  do \
+  { \
+    printf("%s\n", msg); \
+    Crash(); \
+  } while (false)
+
+#define DYNA_REC 0
+
+#define ERROR_LOG(which, fmt, ...) \
+  do \
+  { \
+    printf(fmt "\n", ## __VA_ARGS__); \
+  } while (false)
diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp
new file mode 100644
index 0000000..01cb897
--- /dev/null
+++ b/src/dolphin/MemoryUtil.cpp
@@ -0,0 +1,193 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+
+#define PanicAlert(fmt, ...) \
+  do \
+  { \
+    printf(fmt "\n", ## __VA_ARGS__); \
+    abort(); \
+  } while (false)
+
+#include "../types.h"
+#include "CommonFuncs.h"
+
+#ifdef _WIN32
+#include <windows.h>
+//#include "Common/StringUtil.h"
+#else
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#if defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
+#include <sys/sysctl.h>
+#elif defined __HAIKU__
+#include <OS.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+namespace Common
+{
+// This is purposely not a full wrapper for virtualalloc/mmap, but it
+// provides exactly the primitive operations that Dolphin needs.
+
+void* AllocateExecutableMemory(size_t size)
+{
+  printf("c\n");
+
+#if defined(_WIN32)
+  void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+#else
+  void* ptr =
+      mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
+
+  if (ptr == MAP_FAILED)
+    ptr = nullptr;
+#endif
+  printf("a\n");
+
+  if (ptr == nullptr)
+    PanicAlert("Failed to allocate executable memory");
+
+  printf("b\n");
+
+  return ptr;
+}
+
+void* AllocateMemoryPages(size_t size)
+{
+#ifdef _WIN32
+  void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_READWRITE);
+#else
+  void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+
+  if (ptr == MAP_FAILED)
+    ptr = nullptr;
+#endif
+
+  if (ptr == nullptr)
+    PanicAlert("Failed to allocate raw memory");
+
+  return ptr;
+}
+
+void* AllocateAlignedMemory(size_t size, size_t alignment)
+{
+#ifdef _WIN32
+  void* ptr = _aligned_malloc(size, alignment);
+#else
+  void* ptr = nullptr;
+  if (posix_memalign(&ptr, alignment, size) != 0)
+    ERROR_LOG(MEMMAP, "Failed to allocate aligned memory");
+#endif
+
+  if (ptr == nullptr)
+    PanicAlert("Failed to allocate aligned memory");
+
+  return ptr;
+}
+
+void FreeMemoryPages(void* ptr, size_t size)
+{
+  if (ptr)
+  {
+#ifdef _WIN32
+    if (!VirtualFree(ptr, 0, MEM_RELEASE))
+      PanicAlert("FreeMemoryPages failed!\nVirtualFree: %s", GetLastErrorString().c_str());
+#else
+    if (munmap(ptr, size) != 0)
+      PanicAlert("FreeMemoryPages failed!\nmunmap: %s", LastStrerrorString().c_str());
+#endif
+  }
+}
+
+void FreeAlignedMemory(void* ptr)
+{
+  if (ptr)
+  {
+#ifdef _WIN32
+    _aligned_free(ptr);
+#else
+    free(ptr);
+#endif
+  }
+}
+
+void ReadProtectMemory(void* ptr, size_t size)
+{
+#ifdef _WIN32
+  DWORD oldValue;
+  if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue))
+    PanicAlert("ReadProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+  if (mprotect(ptr, size, PROT_NONE) != 0)
+    PanicAlert("ReadProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+#endif
+}
+
+void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
+{
+#ifdef _WIN32
+  DWORD oldValue;
+  if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, &oldValue))
+    PanicAlert("WriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+  if (mprotect(ptr, size, allowExecute ? (PROT_READ | PROT_EXEC) : PROT_READ) != 0)
+    PanicAlert("WriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+#endif
+}
+
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)
+{
+#ifdef _WIN32
+  DWORD oldValue;
+  if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldValue))
+    PanicAlert("UnWriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str());
+#else
+  if (mprotect(ptr, size,
+               allowExecute ? (PROT_READ | PROT_WRITE | PROT_EXEC) : PROT_WRITE | PROT_READ) != 0)
+  {
+    PanicAlert("UnWriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str());
+  }
+#endif
+}
+
+size_t MemPhysical()
+{
+#ifdef _WIN32
+  MEMORYSTATUSEX memInfo;
+  memInfo.dwLength = sizeof(MEMORYSTATUSEX);
+  GlobalMemoryStatusEx(&memInfo);
+  return memInfo.ullTotalPhys;
+#elif defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__
+  int mib[2];
+  size_t physical_memory;
+  mib[0] = CTL_HW;
+#ifdef __APPLE__
+  mib[1] = HW_MEMSIZE;
+#elif defined __FreeBSD__
+  mib[1] = HW_REALMEM;
+#elif defined __OpenBSD__
+  mib[1] = HW_PHYSMEM;
+#endif
+  size_t length = sizeof(size_t);
+  sysctl(mib, 2, &physical_memory, &length, NULL, 0);
+  return physical_memory;
+#elif defined __HAIKU__
+  system_info sysinfo;
+  get_system_info(&sysinfo);
+  return static_cast<size_t>(sysinfo.max_pages * B_PAGE_SIZE);
+#else
+  struct sysinfo memInfo;
+  sysinfo(&memInfo);
+  return (size_t)memInfo.totalram * memInfo.mem_unit;
+#endif
+}
+
+}  // namespace Common
diff --git a/src/dolphin/MemoryUtil.h b/src/dolphin/MemoryUtil.h
new file mode 100644
index 0000000..607b7a8
--- /dev/null
+++ b/src/dolphin/MemoryUtil.h
@@ -0,0 +1,22 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+
+namespace Common
+{
+void* AllocateExecutableMemory(size_t size);
+void* AllocateMemoryPages(size_t size);
+void FreeMemoryPages(void* ptr, size_t size);
+void* AllocateAlignedMemory(size_t size, size_t alignment);
+void FreeAlignedMemory(void* ptr);
+void ReadProtectMemory(void* ptr, size_t size);
+void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false);
+size_t MemPhysical();
+
+}  // namespace Common
diff --git a/src/dolphin/license_dolphin.txt b/src/dolphin/license_dolphin.txt
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/src/dolphin/license_dolphin.txt
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). 
+Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/src/dolphin/x64ABI.cpp b/src/dolphin/x64ABI.cpp new file mode 100644 index 0000000..d86a158 --- /dev/null +++ b/src/dolphin/x64ABI.cpp @@ -0,0 +1,119 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include "../types.h" +#include "x64ABI.h" +#include "x64Emitter.h" + +using namespace Gen; + +// Shared code between Win64 and Unix64 + +void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) +{ + size_t shadow = 0; +#if defined(_WIN32) + shadow = 0x20; +#endif + + int count = (mask & ABI_ALL_GPRS).Count(); + rsp_alignment -= count * 8; + size_t subtraction = 0; + int fpr_count = (mask & ABI_ALL_FPRS).Count(); + if (fpr_count) + { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xf; + } + subtraction += 16 * fpr_count; + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += shadow; + // Final alignment. + rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xf; + + *shadowp = shadow; + *subtractionp = subtraction; + *xmm_offsetp = subtraction - xmm_base_subtraction; +} + +size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int r : mask& ABI_ALL_GPRS) + PUSH((X64Reg)r); + + if (subtraction) + SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16)); + xmm_offset += 16; + } + + return shadow; +} + +void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD((X64Reg)(x - 16), MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; + } + + if (subtraction) + ADD(64, R(RSP), subtraction >= 0x80 ? 
Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int r = 15; r >= 0; r--) + { + if (mask[r]) + POP((X64Reg)r); + } +} + +void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2, + Gen::X64Reg src2) +{ + if (dst1 == src2 && dst2 == src1) + { + XCHG(bits, R(src1), R(src2)); + if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + } + else if (src2 != dst1) + { + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) + MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + if (dst2 != src2) + MOV(bits, R(dst2), R(src2)); + } + else + { + if (dst2 != src2) + MOV(bits, R(dst2), R(src2)); + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) + MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + } +} diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h new file mode 100644 index 0000000..997782e --- /dev/null +++ b/src/dolphin/x64ABI.h @@ -0,0 +1,57 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#pragma once + +#include "BitSet.h" +#include "x64Reg.h" + +// x64 ABI:s, and helpers to help follow them when JIT-ing code. +// All convensions return values in EAX (+ possibly EDX). + +// Windows 64-bit +// * 4-reg "fastcall" variant, very new-skool stack handling +// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself +// calls_ +// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space. +// Scratch: RAX RCX RDX R8 R9 R10 R11 +// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15 +// Parameters: RCX RDX R8 R9, further MOV-ed + +// Linux 64-bit +// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed) +// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11 +// Callee-save: RBX RBP R12 R13 R14 R15 +// Parameters: RDI RSI RDX RCX R8 R9 + +#define ABI_ALL_FPRS BitSet32(0xffff0000) +#define ABI_ALL_GPRS BitSet32(0x0000ffff) + +#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention + +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX +#define ABI_PARAM3 R8 +#define ABI_PARAM4 R9 + +// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. +#define ABI_ALL_CALLER_SAVED \ + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11}) +#else // 64-bit Unix / OS X + +#define ABI_PARAM1 RDI +#define ABI_PARAM2 RSI +#define ABI_PARAM3 RDX +#define ABI_PARAM4 RCX +#define ABI_PARAM5 R8 +#define ABI_PARAM6 R9 + +// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably +// don't actually clobber them. +#define ABI_ALL_CALLER_SAVED (BitSet32{RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11} | ABI_ALL_FPRS) +#endif // WIN32 + +#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED) + +#define ABI_RETURN RAX diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp new file mode 100644 index 0000000..05ee11c --- /dev/null +++ b/src/dolphin/x64CPUDetect.cpp @@ -0,0 +1,274 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. 
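+
+// CPUID leaves queried below: 0x0 (vendor string / max leaf), 0x1 (feature
+// bits), 0x4 (Intel core topology), 0x7 (extended feature bits), 0x80000001
+// (extended feature bits), 0x80000002-4 (brand string), 0x80000008 (core count).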
+
+#include <cstring>
+#include <string>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "Intrinsics.h"
+
+#ifndef _MSC_VER
+
+#ifdef __FreeBSD__
+#include <unistd.h>
+
+#include <machine/cpufunc.h>
+#include <sys/types.h>
+#endif
+
+static inline void __cpuidex(int info[4], int function_id, int subfunction_id)
+{
+#ifdef __FreeBSD__
+  // Despite the name, this is just do_cpuid() with ECX as second input.
+  cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
+#else
+  info[0] = function_id;     // eax
+  info[2] = subfunction_id;  // ecx
+  __asm__("cpuid"
+          : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+          : "a"(function_id), "c"(subfunction_id));
+#endif
+}
+
+static inline void __cpuid(int info[4], int function_id)
+{
+  return __cpuidex(info, function_id, 0);
+}
+
+#endif  // ifndef _MSC_VER
+
+#ifdef _MSC_VER
+
+static u64 xgetbv(u32 index)
+{
+  return _xgetbv(index);
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = _XCR_XFEATURE_ENABLED_MASK;
+
+#else
+
+static u64 xgetbv(u32 index)
+{
+  u32 eax, edx;
+  __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+  return ((u64)edx << 32) | eax;
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = 0;
+#endif  // ifdef _MSC_VER
+
+CPUInfo cpu_info;
+
+CPUInfo::CPUInfo()
+{
+  Detect();
+}
+
+// Detects the various CPU features
+void CPUInfo::Detect()
+{
+#ifdef _M_X86_64
+  Mode64bit = true;
+  OS64bit = true;
+#endif
+  num_cores = 1;
+
+  // Set obvious defaults, for extra safety
+  if (Mode64bit)
+  {
+    bSSE = true;
+    bSSE2 = true;
+    bLongMode = true;
+  }
+
+  // Assume CPU supports the CPUID instruction. Those that don't can barely
+  // boot modern OSes anyway.
+  int cpu_id[4];
+
+  // Detect CPU's CPUID capabilities, and grab CPU string
+  __cpuid(cpu_id, 0x00000000);
+  u32 max_std_fn = cpu_id[0];  // EAX
+  std::memcpy(&brand_string[0], &cpu_id[1], sizeof(int));
+  std::memcpy(&brand_string[4], &cpu_id[3], sizeof(int));
+  std::memcpy(&brand_string[8], &cpu_id[2], sizeof(int));
+  __cpuid(cpu_id, 0x80000000);
+  u32 max_ex_fn = cpu_id[0];
+  if (!strcmp(brand_string, "GenuineIntel"))
+    vendor = CPUVendor::Intel;
+  else if (!strcmp(brand_string, "AuthenticAMD"))
+    vendor = CPUVendor::AMD;
+  else
+    vendor = CPUVendor::Other;
+
+  // Set reasonable default brand string even if brand string not available.
+  strcpy(cpu_string, brand_string);
+
+  // Detect family and other misc stuff.
+  bool ht = false;
+  HTT = ht;
+  logical_cpu_count = 1;
+  if (max_std_fn >= 1)
+  {
+    __cpuid(cpu_id, 0x00000001);
+    int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+    int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+    // Detect people unfortunate enough to be running Dolphin on an Atom
+    if (family == 6 &&
+        (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 ||
+         model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+      bAtom = true;
+    logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
+    ht = (cpu_id[3] >> 28) & 1;
+
+    if ((cpu_id[3] >> 25) & 1)
+      bSSE = true;
+    if ((cpu_id[3] >> 26) & 1)
+      bSSE2 = true;
+    if ((cpu_id[2]) & 1)
+      bSSE3 = true;
+    if ((cpu_id[2] >> 9) & 1)
+      bSSSE3 = true;
+    if ((cpu_id[2] >> 19) & 1)
+      bSSE4_1 = true;
+    if ((cpu_id[2] >> 20) & 1)
+      bSSE4_2 = true;
+    if ((cpu_id[2] >> 22) & 1)
+      bMOVBE = true;
+    if ((cpu_id[2] >> 25) & 1)
+      bAES = true;
+
+    if ((cpu_id[3] >> 24) & 1)
+    {
+      // We can use FXSAVE.
+      bFXSR = true;
+    }
+
+    // AVX support requires 3 separate checks:
+    //  - Is the AVX bit set in CPUID?
+    //  - Is the XSAVE bit set in CPUID?
+    //  - XGETBV result has the XCR bit set.
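+    // (XGETBV(XCR0) bit 1 = SSE/XMM state, bit 2 = AVX/YMM state; both must
+    // be enabled by the OS, hence the (value & 0x6) == 0x6 test below.)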
+    if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1))
+    {
+      if ((xgetbv(XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)
+      {
+        bAVX = true;
+        if ((cpu_id[2] >> 12) & 1)
+          bFMA = true;
+      }
+    }
+
+    if (max_std_fn >= 7)
+    {
+      __cpuidex(cpu_id, 0x00000007, 0x00000000);
+      // careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed
+      if ((cpu_id[1] >> 5) & 1)
+        bAVX2 = bAVX;
+      if ((cpu_id[1] >> 3) & 1)
+        bBMI1 = true;
+      if ((cpu_id[1] >> 8) & 1)
+        bBMI2 = true;
+    }
+  }
+
+  bFlushToZero = bSSE;
+
+  if (max_ex_fn >= 0x80000004)
+  {
+    // Extract CPU model string
+    __cpuid(cpu_id, 0x80000002);
+    memcpy(cpu_string, cpu_id, sizeof(cpu_id));
+    __cpuid(cpu_id, 0x80000003);
+    memcpy(cpu_string + 16, cpu_id, sizeof(cpu_id));
+    __cpuid(cpu_id, 0x80000004);
+    memcpy(cpu_string + 32, cpu_id, sizeof(cpu_id));
+  }
+  if (max_ex_fn >= 0x80000001)
+  {
+    // Check for more features.
+    __cpuid(cpu_id, 0x80000001);
+    if (cpu_id[2] & 1)
+      bLAHFSAHF64 = true;
+    if ((cpu_id[2] >> 5) & 1)
+      bLZCNT = true;
+    if ((cpu_id[2] >> 16) & 1)
+      bFMA4 = true;
+    if ((cpu_id[3] >> 29) & 1)
+      bLongMode = true;
+  }
+
+  num_cores = (logical_cpu_count == 0) ? 1 : logical_cpu_count;
+
+  if (max_ex_fn >= 0x80000008)
+  {
+    // Get number of cores. This is a bit complicated. Following AMD manual here.
+    __cpuid(cpu_id, 0x80000008);
+    int apic_id_core_id_size = (cpu_id[2] >> 12) & 0xF;
+    if (apic_id_core_id_size == 0)
+    {
+      if (ht)
+      {
+        // New mechanism for modern Intel CPUs.
+        if (vendor == CPUVendor::Intel)
+        {
+          __cpuidex(cpu_id, 0x00000004, 0x00000000);
+          int cores_x_package = ((cpu_id[0] >> 26) & 0x3F) + 1;
+          HTT = (cores_x_package < logical_cpu_count);
+          cores_x_package = ((logical_cpu_count % cores_x_package) == 0) ? cores_x_package : 1;
+          num_cores = (cores_x_package > 1) ? cores_x_package : num_cores;
+          logical_cpu_count /= cores_x_package;
+        }
+      }
+    }
+    else
+    {
+      // Use AMD's new method.
+      num_cores = (cpu_id[2] & 0xFF) + 1;
+    }
+  }
+}
+
+// Turn the CPU info into a string we can show
+std::string CPUInfo::Summarize()
+{
+  std::string sum(cpu_string);
+  sum += " (";
+  sum += brand_string;
+  sum += ")";
+
+  if (bSSE)
+    sum += ", SSE";
+  if (bSSE2)
+  {
+    sum += ", SSE2";
+    if (!bFlushToZero)
+      sum += " (but not DAZ!)";
+  }
+  if (bSSE3)
+    sum += ", SSE3";
+  if (bSSSE3)
+    sum += ", SSSE3";
+  if (bSSE4_1)
+    sum += ", SSE4.1";
+  if (bSSE4_2)
+    sum += ", SSE4.2";
+  if (HTT)
+    sum += ", HTT";
+  if (bAVX)
+    sum += ", AVX";
+  if (bAVX2)
+    sum += ", AVX2";
+  if (bBMI1)
+    sum += ", BMI1";
+  if (bBMI2)
+    sum += ", BMI2";
+  if (bFMA)
+    sum += ", FMA";
+  if (bAES)
+    sum += ", AES";
+  if (bMOVBE)
+    sum += ", MOVBE";
+  if (bLongMode)
+    sum += ", 64-bit support";
+  return sum;
+}
diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp
new file mode 100644
index 0000000..7849624
--- /dev/null
+++ b/src/dolphin/x64Emitter.cpp
@@ -0,0 +1,3398 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cinttypes>
+#include <cstring>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "Log.h"
+#include "x64Emitter.h"
+#include "x64Reg.h"
+
+namespace Gen
+{
+// TODO(ector): Add EAX special casing, for ever so slightly smaller code.
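+
+// Callers are expected to pick instruction forms based on the detected CPU
+// features at runtime, e.g. (illustrative only):
+//
+//   if (cpu_info.bAVX)
+//   {
+//     // emit VEX-encoded three-operand forms
+//   }
+//   else
+//   {
+//     // emit legacy two-operand SSE forms
+//   }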
+struct NormalOpDef
+{
+  u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
+};
+
+// 0xCC is code for invalid combination of immediates
+static const NormalOpDef normalops[11] = {
+    {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0},  // ADD
+    {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2},  // ADC
+
+    {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5},  // SUB
+    {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3},  // SBB
+
+    {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4},  // AND
+    {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1},  // OR
+
+    {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6},  // XOR
+    {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0},  // MOV
+
+    {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0},  // TEST (to == from)
+    {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7},  // CMP
+
+    {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7},  // XCHG
+};
+
+enum NormalSSEOps
+{
+  sseCMP = 0xC2,
+  sseADD = 0x58,          // ADD
+  sseSUB = 0x5C,          // SUB
+  sseAND = 0x54,          // AND
+  sseANDN = 0x55,         // ANDN
+  sseOR = 0x56,
+  sseXOR = 0x57,
+  sseMUL = 0x59,          // MUL
+  sseDIV = 0x5E,          // DIV
+  sseMIN = 0x5D,          // MIN
+  sseMAX = 0x5F,          // MAX
+  sseCOMIS = 0x2F,        // COMIS
+  sseUCOMIS = 0x2E,       // UCOMIS
+  sseSQRT = 0x51,         // SQRT
+  sseRCP = 0x53,          // RCP
+  sseRSQRT = 0x52,        // RSQRT (NO DOUBLE PRECISION!!!)
+  sseMOVAPfromRM = 0x28,  // MOVAP from RM
+  sseMOVAPtoRM = 0x29,    // MOVAP to RM
+  sseMOVUPfromRM = 0x10,  // MOVUP from RM
+  sseMOVUPtoRM = 0x11,    // MOVUP to RM
+  sseMOVLPfromRM = 0x12,
+  sseMOVLPtoRM = 0x13,
+  sseMOVHPfromRM = 0x16,
+  sseMOVHPtoRM = 0x17,
+  sseMOVHLPS = 0x12,
+  sseMOVLHPS = 0x16,
+  sseMOVDQfromRM = 0x6F,
+  sseMOVDQtoRM = 0x7F,
+  sseMASKMOVDQU = 0xF7,
+  sseLDDQU = 0xF0,
+  sseSHUF = 0xC6,
+  sseMOVNTDQ = 0xE7,
+  sseMOVNTP = 0x2B,
+};
+
+enum class NormalOp
+{
+  ADD,
+  ADC,
+  SUB,
+  SBB,
+  AND,
+  OR,
+  XOR,
+  MOV,
+  TEST,
+  CMP,
+  XCHG,
+};
+
+enum class FloatOp
+{
+  LD = 0,
+  ST = 2,
+  STP = 3,
+  LD80 = 5,
+  STP80 = 7,
+
+  Invalid = -1,
+};
+
+void XEmitter::SetCodePtr(u8* ptr)
+{
+  code = ptr;
+}
+
+const u8* XEmitter::GetCodePtr() const
+{
+  return code;
+}
+
+u8* XEmitter::GetWritableCodePtr()
+{
+  return code;
+}
+
+void XEmitter::Write8(u8 value)
+{
+  *code++ = value;
+}
+
+void XEmitter::Write16(u16 value)
+{
+  std::memcpy(code, &value, sizeof(u16));
+  code += sizeof(u16);
+}
+
+void XEmitter::Write32(u32 value)
+{
+  std::memcpy(code, &value, sizeof(u32));
+  code += sizeof(u32);
+}
+
+void XEmitter::Write64(u64 value)
+{
+  std::memcpy(code, &value, sizeof(u64));
+  code += sizeof(u64);
+}
+
+void XEmitter::ReserveCodeSpace(int bytes)
+{
+  for (int i = 0; i < bytes; i++)
+    *code++ = 0xCC;
+}
+
+u8* XEmitter::AlignCodeTo(size_t alignment)
+{
+  ASSERT_MSG(DYNA_REC, alignment != 0 && (alignment & (alignment - 1)) == 0,
+             "Alignment must be power of two");
+  u64 c = reinterpret_cast<u64>(code) & (alignment - 1);
+  if (c)
+    ReserveCodeSpace(static_cast<int>(alignment - c));
+  return code;
+}
+
+u8* XEmitter::AlignCode4()
+{
+  return AlignCodeTo(4);
+}
+
+u8* XEmitter::AlignCode16()
+{
+  return AlignCodeTo(16);
+}
+
+u8* XEmitter::AlignCodePage()
+{
+  return AlignCodeTo(4096);
+}
+
+// This operation modifies flags; check to see whether the flags are locked.
+// If the flags are locked, we should immediately and loudly fail before
+// causing a subtle JIT bug.
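+// (For example, an ADD accidentally emitted between a CMP and the Jcc that
+// consumes its result would silently flip the branch; the assert below turns
+// that mistake into a loud failure instead.)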
+void XEmitter::CheckFlags() +{ + ASSERT_MSG(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!"); +} + +void XEmitter::WriteModRM(int mod, int reg, int rm) +{ + Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); +} + +void XEmitter::WriteSIB(int scale, int index, int base) +{ + Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); +} + +void OpArg::WriteREX(XEmitter* emit, int opBits, int bits, int customOp) const +{ + if (customOp == -1) + customOp = operandReg; + u8 op = 0x40; + // REX.W (whether operation is a 64-bit operation) + if (opBits == 64) + op |= 8; + // REX.R (whether ModR/M reg field refers to R8-R15. + if (customOp & 8) + op |= 4; + // REX.X (whether ModR/M SIB index field refers to R8-R15) + if (indexReg & 8) + op |= 2; + // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15) + if (offsetOrBaseReg & 8) + op |= 1; + // Write REX if wr have REX bits to write, or if the operation accesses + // SIL, DIL, BPL, or SPL. + if (op != 0x40 || (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) || + (opBits == 8 && (customOp & 0x10c) == 4)) + { + emit->Write8(op); + // Check the operation doesn't access AH, BH, CH, or DH. + DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0); + DEBUG_ASSERT((customOp & 0x100) == 0); + } +} + +void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W) const +{ + int R = !(regOp1 & 8); + int X = !(indexReg & 8); + int B = !(offsetOrBaseReg & 8); + + int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf); + + // do we need any VEX fields that only appear in the three-byte form? + if (X == 1 && B == 1 && W == 0 && mmmmm == 1) + { + u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp; + emit->Write8(0xC5); + emit->Write8(RvvvvLpp); + } + else + { + u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm; + u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp; + emit->Write8(0xC4); + emit->Write8(RXBmmmmm); + emit->Write8(WvvvvLpp); + } +} + +void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg, + bool warn_64bit_offset) const +{ + if (_operandReg == INVALID_REG) + _operandReg = (X64Reg)this->operandReg; + int mod = 0; + int ireg = indexReg; + bool SIB = false; + int _offsetOrBaseReg = this->offsetOrBaseReg; + + if (scale == SCALE_RIP) // Also, on 32-bit, just an immediate address + { + // Oh, RIP addressing. + _offsetOrBaseReg = 5; + emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); + // TODO : add some checks + u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; + s64 distance = (s64)offset - (s64)ripAddr; + ASSERT_MSG(DYNA_REC, + (distance < 0x80000000LL && distance >= -0x80000000LL) || !warn_64bit_offset, + "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", ripAddr, offset); + s32 offs = (s32)distance; + emit->Write32((u32)offs); + return; + } + + if (scale == 0) + { + // Oh, no memory, Just a reg. + mod = 3; // 11 + } + else + { + // Ah good, no scaling. + if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5)) + { + // Okay, we're good. No SIB necessary. 
+ int ioff = (int)offset; + if (ioff == 0) + { + mod = 0; + } + else if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) + { + SIB = true; + mod = 0; + _offsetOrBaseReg = 5; + } + else + { + if ((_offsetOrBaseReg & 7) == 4) // this would occupy the SIB encoding :( + { + // So we have to fake it with SIB encoding :( + SIB = true; + } + + if (scale >= SCALE_1 && scale < SCALE_ATREG) + { + SIB = true; + } + + if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) + { + SIB = true; + ireg = _offsetOrBaseReg; + } + + // Okay, we're fine. Just disp encoding. + // We need displacement. Which size? + int ioff = (int)(s64)offset; + if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + } + + // Okay. Time to do the actual writing + // ModRM byte: + int oreg = _offsetOrBaseReg; + if (SIB) + oreg = 4; + + emit->WriteModRM(mod, _operandReg & 7, oreg & 7); + + if (SIB) + { + // SIB byte + int ss; + switch (scale) + { + case SCALE_NONE: + _offsetOrBaseReg = 4; + ss = 0; + break; // RSP + case SCALE_1: + ss = 0; + break; + case SCALE_2: + ss = 1; + break; + case SCALE_4: + ss = 2; + break; + case SCALE_8: + ss = 3; + break; + case SCALE_NOBASE_2: + ss = 1; + break; + case SCALE_NOBASE_4: + ss = 2; + break; + case SCALE_NOBASE_8: + ss = 3; + break; + case SCALE_ATREG: + ss = 0; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "Invalid scale for SIB byte"); + ss = 0; + break; + } + emit->Write8((u8)((ss << 6) | ((ireg & 7) << 3) | (_offsetOrBaseReg & 7))); + } + + if (mod == 1) // 8-bit disp + { + emit->Write8((u8)(s8)(s32)offset); + } + else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) // 32-bit disp + { + emit->Write32((u32)offset); + } +} + +// W = operand extended width (1 if 64-bit) +// R = register# upper bit +// X = scale amnt upper bit +// B = base register# upper bit +void XEmitter::Rex(int w, int r, int x, int b) +{ + w = w ? 1 : 0; + r = r ? 1 : 0; + x = x ? 1 : 0; + b = b ? 
1 : 0; + u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); + if (rx != 0x40) + Write8(rx); +} + +void XEmitter::JMP(const u8* addr, bool force5Bytes) +{ + u64 fn = (u64)addr; + if (!force5Bytes) + { + s64 distance = (s64)(fn - ((u64)code + 2)); + ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + // 8 bits will do + Write8(0xEB); + Write8((u8)(s8)distance); + } + else + { + s64 distance = (s64)(fn - ((u64)code + 5)); + + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0xE9); + Write32((u32)(s32)distance); + } +} + +void XEmitter::JMPptr(const OpArg& arg2) +{ + OpArg arg = arg2; + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "JMPptr - Imm argument"); + arg.operandReg = 4; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +// Can be used to trap other processors, before overwriting their code +// not used in Dolphin +void XEmitter::JMPself() +{ + Write8(0xEB); + Write8(0xFE); +} + +void XEmitter::CALLptr(OpArg arg) +{ + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "CALLptr - Imm argument"); + arg.operandReg = 2; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +void XEmitter::CALL(const void* fnptr) +{ + u64 distance = u64(fnptr) - (u64(code) + 5); + ASSERT_MSG(DYNA_REC, distance < 0x0000000080000000ULL || distance >= 0xFFFFFFFF80000000ULL, + "CALL out of range (%p calls %p)", code, fnptr); + Write8(0xE8); + Write32(u32(distance)); +} + +FixupBranch XEmitter::CALL() +{ + FixupBranch branch; + branch.type = FixupBranch::Type::Branch32Bit; + branch.ptr = code + 5; + Write8(0xE8); + Write32(0); + return branch; +} + +FixupBranch XEmitter::J(bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 5 : 2); + if (!force5bytes) + { + // 8 bits will do + Write8(0xEB); + Write8(0); + } + else + { + Write8(0xE9); + Write32(0); + } + return branch; +} + +FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 
+ if (!force5bytes)
+ {
+ // 8 bits will do
+ Write8(0x70 + conditionCode);
+ Write8(0);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0x80 + conditionCode);
+ Write32(0);
+ }
+ return branch;
+}
+
+void XEmitter::J_CC(CCFlags conditionCode, const u8* addr)
+{
+ u64 fn = (u64)addr;
+ s64 distance = (s64)(fn - ((u64)code + 2));
+ if (distance < -0x80 || distance >= 0x80)
+ {
+ distance = (s64)(fn - ((u64)code + 6));
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+ Write8(0x0F);
+ Write8(0x80 + conditionCode);
+ Write32((u32)(s32)distance);
+ }
+ else
+ {
+ Write8(0x70 + conditionCode);
+ Write8((u8)(s8)distance);
+ }
+}
+
+void XEmitter::SetJumpTarget(const FixupBranch& branch)
+{
+ if (branch.type == FixupBranch::Type::Branch8Bit)
+ {
+ s64 distance = (s64)(code - branch.ptr);
+ if (!(distance >= -0x80 && distance < 0x80))
+ {
+ printf("SetJumpTarget: 8-bit branch target out of range\n");
+ }
+ ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80,
+ "Jump target too far away, needs force5Bytes = true");
+ branch.ptr[-1] = (u8)(s8)distance;
+ }
+ else if (branch.type == FixupBranch::Type::Branch32Bit)
+ {
+ s64 distance = (s64)(code - branch.ptr);
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+
+ s32 valid_distance = static_cast<s32>(distance);
+ std::memcpy(&branch.ptr[-4], &valid_distance, sizeof(s32));
+ }
+}
+
+// Single byte opcodes
+// There is no PUSHAD/POPAD in 64-bit mode.
+void XEmitter::INT3()
+{
+ Write8(0xCC);
+}
+void XEmitter::RET()
+{
+ Write8(0xC3);
+}
+void XEmitter::RET_FAST()
+{
+ Write8(0xF3);
+ Write8(0xC3);
+} // two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to
+ // a ret
+
+// The first sign of decadence: optimized NOPs.
+void XEmitter::NOP(size_t size)
+{
+ DEBUG_ASSERT((int)size > 0);
+ while (true)
+ {
+ switch (size)
+ {
+ case 0:
+ return;
+ case 1:
+ Write8(0x90);
+ return;
+ case 2:
+ Write8(0x66);
+ Write8(0x90);
+ return;
+ case 3:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x00);
+ return;
+ case 4:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x40);
+ Write8(0x00);
+ return;
+ case 5:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x44);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 6:
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x44);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 7:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x80);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 8:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 9:
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 10:
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ default:
+ // Even though x86 instructions are allowed to be up to 15 bytes long,
+ // AMD advises against using NOPs longer than 11 bytes because they
+ // carry a performance penalty on CPUs older than AMD family 16h.
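+ // Worked example (illustrative): NOP(25) emits two of the 11-byte
+ // 66 66 66 0F 1F 84 00 00 00 00 00 forms from this default case, then
+ // loops back and finishes with the 3-byte 0F 1F 00 form.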
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ size -= 11;
+ continue;
+ }
+ }
+}
+
+void XEmitter::PAUSE()
+{
+ Write8(0xF3);
+ NOP();
+} // use in tight spinloops for energy saving on some CPUs
+void XEmitter::CLC()
+{
+ CheckFlags();
+ Write8(0xF8);
+} // clear carry
+void XEmitter::CMC()
+{
+ CheckFlags();
+ Write8(0xF5);
+} // flip carry
+void XEmitter::STC()
+{
+ CheckFlags();
+ Write8(0xF9);
+} // set carry
+
+// TODO: xchg ah, al ???
+void XEmitter::XCHG_AHAL()
+{
+ Write8(0x86);
+ Write8(0xe0);
+ // alt. 86 c4
+}
+
+// These two cannot be executed on early Intel 64-bit CPUs, only on AMD!
+void XEmitter::LAHF()
+{
+ Write8(0x9F);
+}
+void XEmitter::SAHF()
+{
+ CheckFlags();
+ Write8(0x9E);
+}
+
+void XEmitter::PUSHF()
+{
+ Write8(0x9C);
+}
+void XEmitter::POPF()
+{
+ CheckFlags();
+ Write8(0x9D);
+}
+
+void XEmitter::LFENCE()
+{
+ Write8(0x0F);
+ Write8(0xAE);
+ Write8(0xE8);
+}
+void XEmitter::MFENCE()
+{
+ Write8(0x0F);
+ Write8(0xAE);
+ Write8(0xF0);
+}
+void XEmitter::SFENCE()
+{
+ Write8(0x0F);
+ Write8(0xAE);
+ Write8(0xF8);
+}
+
+void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg)
+{
+ if (bits == 16)
+ Write8(0x66);
+ Rex(bits == 64, 0, 0, (int)reg >> 3);
+ Write8(byte + ((int)reg & 7));
+}
+
+void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
+{
+ if (bits == 16)
+ Write8(0x66);
+ Rex(bits == 64, 0, 0, (int)reg >> 3);
+ Write8(byte1);
+ Write8(byte2 + ((int)reg & 7));
+}
+
+void XEmitter::CWD(int bits)
+{
+ if (bits == 16)
+ Write8(0x66);
+ Rex(bits == 64, 0, 0, 0);
+ Write8(0x99);
+}
+
+void XEmitter::CBW(int bits)
+{
+ if (bits == 8)
+ Write8(0x66);
+ Rex(bits == 32, 0, 0, 0);
+ Write8(0x98);
+}
+
+// Simple opcodes
+
+// push/pop do not need wide to be 64-bit
+void XEmitter::PUSH(X64Reg reg)
+{
+ WriteSimple1Byte(32, 0x50, reg);
+}
+void XEmitter::POP(X64Reg reg)
+{
+ WriteSimple1Byte(32, 0x58, reg);
+}
+
+void XEmitter::PUSH(int bits, const OpArg& reg)
+{
+ if (reg.IsSimpleReg())
+ PUSH(reg.GetSimpleReg());
+ else if (reg.IsImm())
+ {
+ switch (reg.GetImmBits())
+ {
+ case 8:
+ Write8(0x6A);
+ Write8((u8)(s8)reg.offset);
+ break;
+ case 16:
+ Write8(0x66);
+ Write8(0x68);
+ Write16((u16)(s16)(s32)reg.offset);
+ break;
+ case 32:
+ Write8(0x68);
+ Write32((u32)reg.offset);
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, 0, "PUSH - Bad imm bits");
+ break;
+ }
+ }
+ else
+ {
+ if (bits == 16)
+ Write8(0x66);
+ reg.WriteREX(this, bits, bits);
+ Write8(0xFF);
+ reg.WriteRest(this, 0, (X64Reg)6);
+ }
+}
+
+void XEmitter::POP(int /*bits*/, const OpArg& reg)
+{
+ if (reg.IsSimpleReg())
+ POP(reg.GetSimpleReg());
+ else
+ ASSERT_MSG(DYNA_REC, 0, "POP - Unsupported encoding");
+}
+
+void XEmitter::BSWAP(int bits, X64Reg reg)
+{
+ if (bits >= 32)
+ {
+ WriteSimple2Byte(bits, 0x0F, 0xC8, reg);
+ }
+ else if (bits == 16)
+ {
+ ROL(16, R(reg), Imm8(8));
+ }
+ else if (bits == 8)
+ {
+ // Do nothing - can't bswap a single byte...
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "BSWAP - Wrong number of bits");
+ }
+}
+
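+// Worked example (illustrative): PUSH(64, Imm8(0x10)) takes the imm8 path above
+// and emits 6A 10; in 64-bit mode this still pushes a full sign-extended qword.
+
+// Undefined opcode - reserved
+// If we ever need a way to always cause a non-breakpoint hard exception...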
+void XEmitter::UD2() +{ + Write8(0x0F); + Write8(0x0B); +} + +void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) +{ + ASSERT_MSG(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument"); + arg.operandReg = (u8)level; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0x18); + arg.WriteRest(this); +} + +void XEmitter::SETcc(CCFlags flag, OpArg dest) +{ + ASSERT_MSG(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument"); + dest.operandReg = 0; + dest.WriteREX(this, 0, 8); + Write8(0x0F); + Write8(0x90 + (u8)flag); + dest.WriteRest(this); +} + +void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument"); + ASSERT_MSG(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported"); + if (bits == 16) + Write8(0x66); + src.operandReg = dest; + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0x40 + (u8)flag); + src.WriteRest(this); +} + +void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument"); + CheckFlags(); + src.operandReg = ext; + if (bits == 16) + Write8(0x66); + src.WriteREX(this, bits, bits, 0); + if (bits == 8) + { + Write8(0xF6); + } + else + { + Write8(0xF7); + } + src.WriteRest(this); +} + +void XEmitter::MUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 4); +} +void XEmitter::DIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 6); +} +void XEmitter::IMUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 5); +} +void XEmitter::IDIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 7); +} +void XEmitter::NEG(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 3); +} +void XEmitter::NOT(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 2); +} + +void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument"); + CheckFlags(); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); + if (rep) + Write8(0xF3); + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(byte2); + src.WriteRest(this); +} + +void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src) +{ + if (bits <= 16) + ASSERT_MSG(DYNA_REC, 0, "MOVNTI - bits<=16"); + WriteBitSearchType(bits, src, dest, 0xC3); +} + +void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBC); +} // Bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBD); +} // Top bit to bottom bit + +void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBC, true); +} +void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bLZCNT) + PanicAlert("Trying to use LZCNT on a system that doesn't support it. 
Bad programmer.");
+ WriteBitSearchType(bits, dest, src, 0xBD, true);
+}
+
+void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument");
+ if (dbits == sbits)
+ {
+ MOV(dbits, R(dest), src);
+ return;
+ }
+ src.operandReg = (u8)dest;
+ if (dbits == 16)
+ Write8(0x66);
+ src.WriteREX(this, dbits, sbits);
+ if (sbits == 8)
+ {
+ Write8(0x0F);
+ Write8(0xBE);
+ }
+ else if (sbits == 16)
+ {
+ Write8(0x0F);
+ Write8(0xBF);
+ }
+ else if (sbits == 32 && dbits == 64)
+ {
+ Write8(0x63);
+ }
+ else
+ {
+ Crash();
+ }
+ src.WriteRest(this);
+}
+
+void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument");
+ if (dbits == sbits)
+ {
+ MOV(dbits, R(dest), src);
+ return;
+ }
+ src.operandReg = (u8)dest;
+ if (dbits == 16)
+ Write8(0x66);
+ // the 32-bit result is automatically zero-extended to 64 bits
+ src.WriteREX(this, dbits == 64 ? 32 : dbits, sbits);
+ if (sbits == 8)
+ {
+ Write8(0x0F);
+ Write8(0xB6);
+ }
+ else if (sbits == 16)
+ {
+ Write8(0x0F);
+ Write8(0xB7);
+ }
+ else if (sbits == 32 && dbits == 64)
+ {
+ Write8(0x8B);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "MOVZX - Invalid size");
+ }
+ src.WriteRest(this);
+}
+
+void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg)
+{
+ ASSERT_MSG(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+ if (bits == 8)
+ {
+ MOV(8, op & 1 ? arg : R(reg), op & 1 ? R(reg) : arg);
+ return;
+ }
+ if (bits == 16)
+ Write8(0x66);
+ ASSERT_MSG(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!");
+ arg.WriteREX(this, bits, bits, reg);
+ Write8(0x0F);
+ Write8(0x38);
+ Write8(op);
+ arg.WriteRest(this, 0, reg);
+}
+void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src)
+{
+ WriteMOVBE(bits, 0xF0, dest, src);
+}
+void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src)
+{
+ WriteMOVBE(bits, 0xF1, src, dest);
+}
+
+void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info)
+{
+ if (info)
+ {
+ info->address = GetWritableCodePtr();
+ info->nonAtomicSwapStore = false;
+ }
+
+ switch (size)
+ {
+ case 8:
+ if (sign_extend)
+ MOVSX(32, 8, dst, src);
+ else
+ MOVZX(32, 8, dst, src);
+ break;
+ case 16:
+ MOVZX(32, 16, dst, src);
+ if (sign_extend)
+ {
+ BSWAP(32, dst);
+ SAR(32, R(dst), Imm8(16));
+ }
+ else
+ {
+ ROL(16, R(dst), Imm8(8));
+ }
+ break;
+ case 32:
+ case 64:
+ if (cpu_info.bMOVBE)
+ {
+ MOVBE(size, dst, src);
+ }
+ else
+ {
+ MOV(size, R(dst), src);
+ BSWAP(size, dst);
+ }
+ break;
+ }
+}
+
+void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info)
+{
+ if (cpu_info.bMOVBE)
+ {
+ if (info)
+ {
+ info->address = GetWritableCodePtr();
+ info->nonAtomicSwapStore = false;
+ }
+ MOVBE(size, dst, src);
+ }
+ else
+ {
+ BSWAP(size, src);
+ if (info)
+ {
+ info->address = GetWritableCodePtr();
+ info->nonAtomicSwapStore = true;
+ info->nonAtomicSwapStoreSrc = src;
+ }
+ MOV(size, dst, R(src));
+ }
+}
+
+void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "LEA - Imm argument");
+ src.operandReg = (u8)dest;
+ if (bits == 16)
+ Write8(0x66); // TODO: performance warning
+ src.WriteREX(this, bits, bits);
+ Write8(0x8D);
+ src.WriteRest(this, 0, INVALID_REG, bits == 64);
+}
+
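+// Usage sketch (illustrative, register choices assumed): fetching a big-endian
+// 32-bit value through an XEmitter `emit`, then storing it back byte-swapped:
+// emit.LoadAndSwap(32, EAX, MDisp(RBX, 0x10), false); // MOVBE if available, else MOV + BSWAP
+// emit.SwapAndStore(32, MDisp(RBX, 0x10), EAX); // the inverse store
+
+// shift can be either imm8 or cl
+void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext)
+{
+ CheckFlags();
+ bool writeImm = false;
+ if (dest.IsImm())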
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteShift - can't shift imms");
+ }
+ if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+ (shift.IsImm() && shift.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteShift - illegal argument");
+ }
+ dest.operandReg = ext;
+ if (bits == 16)
+ Write8(0x66);
+ dest.WriteREX(this, bits, bits, 0);
+ if (shift.GetImmBits() == 8)
+ {
+ // ok an imm
+ u8 imm = (u8)shift.offset;
+ if (imm == 1)
+ {
+ Write8(bits == 8 ? 0xD0 : 0xD1);
+ }
+ else
+ {
+ writeImm = true;
+ Write8(bits == 8 ? 0xC0 : 0xC1);
+ }
+ }
+ else
+ {
+ Write8(bits == 8 ? 0xD2 : 0xD3);
+ }
+ dest.WriteRest(this, writeImm ? 1 : 0);
+ if (writeImm)
+ Write8((u8)shift.offset);
+}
+
+// large rotates and shifts are slower on Intel than on AMD
+// Intel likes to rotate by 1, and the op is smaller too
+void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 0);
+}
+void XEmitter::ROR_(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 1);
+}
+void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 2);
+}
+void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 3);
+}
+void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 4);
+}
+void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 5);
+}
+void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 7);
+}
+
+// index can be either imm8 or register; don't use a memory destination because it's slow
+void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext)
+{
+ CheckFlags();
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - can't test imms");
+ }
+ if ((index.IsImm() && index.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - illegal argument");
+ }
+ if (bits == 16)
+ Write8(0x66);
+ if (index.IsImm())
+ {
+ dest.WriteREX(this, bits, bits);
+ Write8(0x0F);
+ Write8(0xBA);
+ dest.WriteRest(this, 1, (X64Reg)ext);
+ Write8((u8)index.offset);
+ }
+ else
+ {
+ X64Reg operand = index.GetSimpleReg();
+ dest.WriteREX(this, bits, bits, operand);
+ Write8(0x0F);
+ Write8(0x83 + 8 * ext);
+ dest.WriteRest(this, 1, operand);
+ }
+}
+
+void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 4);
+}
+void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 5);
+}
+void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 6);
+}
+void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 7);
+}
+
+// shift can be either imm8 or cl
+void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
+{
+ CheckFlags();
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHRD - can't use imms as destination");
+ }
+ if (!src.IsSimpleReg())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHRD - must use simple register as source");
+ }
+ if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+ (shift.IsImm() && shift.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHRD - illegal shift");
+ }
+ if (bits == 16)
+ Write8(0x66);
+ X64Reg operand = src.GetSimpleReg();
+ dest.WriteREX(this, bits, bits, operand);
+ if (shift.GetImmBits() == 8)
+ {
+ Write8(0x0F);
+ Write8(0xAC);
+ dest.WriteRest(this, 1, operand);
+ Write8((u8)shift.offset);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0xAD);
+ dest.WriteRest(this, 0, operand);
+ }
+}
+
+void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
+{
+ CheckFlags();
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHLD - can't use imms as destination");
+ }
+ if (!src.IsSimpleReg())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHLD - must use simple register as source");
+ }
+ if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+ (shift.IsImm() && shift.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHLD - illegal shift");
+ }
+ if (bits == 16)
+ Write8(0x66);
+ X64Reg operand = src.GetSimpleReg();
+ dest.WriteREX(this, bits, bits, operand);
+ if (shift.GetImmBits() == 8)
+ {
+ Write8(0x0F);
+ Write8(0xA4);
+ dest.WriteRest(this, 1, operand);
+ Write8((u8)shift.offset);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0xA5);
+ dest.WriteRest(this, 0, operand);
+ }
+}
+
+void OpArg::WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg _operandReg, int bits)
+{
+ if (bits == 16)
+ emit->Write8(0x66);
+
+ this->operandReg = (u8)_operandReg;
+ WriteREX(emit, bits, bits);
+ emit->Write8(op);
+ WriteRest(emit);
+}
+
+// operand can either be immediate or register
+void OpArg::WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
+ int bits) const
+{
+ X64Reg _operandReg;
+ if (IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order");
+ }
+
+ if (bits == 16)
+ emit->Write8(0x66);
+
+ int immToWrite = 0;
+ const NormalOpDef& op_def = normalops[static_cast<size_t>(op)];
+
+ if (operand.IsImm())
+ {
+ WriteREX(emit, bits, bits);
+
+ if (!toRM)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)");
+ }
+
+ if (operand.scale == SCALE_IMM8 && bits == 8)
+ {
+ // op al, imm8
+ if (!scale && offsetOrBaseReg == AL && op_def.eaximm8 != 0xCC)
+ {
+ emit->Write8(op_def.eaximm8);
+ emit->Write8((u8)operand.offset);
+ return;
+ }
+ // mov reg, imm8
+ if (!scale && op == NormalOp::MOV)
+ {
+ emit->Write8(0xB0 + (offsetOrBaseReg & 7));
+ emit->Write8((u8)operand.offset);
+ return;
+ }
+ // op r/m8, imm8
+ emit->Write8(op_def.imm8);
+ immToWrite = 8;
+ }
+ else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
+ (operand.scale == SCALE_IMM32 && bits == 32) ||
+ (operand.scale == SCALE_IMM32 && bits == 64))
+ {
+ // Try to save immediate size if we can, but first check to see
+ // if the instruction supports simm8.
+ // op r/m, imm8
+ if (op_def.simm8 != 0xCC &&
+ ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) ||
+ (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset)))
+ {
+ emit->Write8(op_def.simm8);
+ immToWrite = 8;
+ }
+ else
+ {
+ // mov reg, imm
+ if (!scale && op == NormalOp::MOV && bits != 64)
+ {
+ emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+ if (bits == 16)
+ emit->Write16((u16)operand.offset);
+ else
+ emit->Write32((u32)operand.offset);
+ return;
+ }
+ // op eax, imm
+ if (!scale && offsetOrBaseReg == EAX && op_def.eaximm32 != 0xCC)
+ {
+ emit->Write8(op_def.eaximm32);
+ if (bits == 16)
+ emit->Write16((u16)operand.offset);
+ else
+ emit->Write32((u32)operand.offset);
+ return;
+ }
+ // op r/m, imm
+ emit->Write8(op_def.imm32);
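+ // Worked example (illustrative): ADD(32, R(ECX), Imm32(4)) satisfies the
+ // simm8 check above and shortens to 83 C1 04, while ADD(32, R(ECX),
+ // Imm32(0x1000)) lands here and encodes as the full 81 C1 00 10 00 00.
+ immToWrite = bits == 16 ? 16 : 32;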
+ }
+ }
+ else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
+ (operand.scale == SCALE_IMM8 && bits == 32) ||
+ (operand.scale == SCALE_IMM8 && bits == 64))
+ {
+ // op r/m, imm8
+ emit->Write8(op_def.simm8);
+ immToWrite = 8;
+ }
+ else if (operand.scale == SCALE_IMM64 && bits == 64)
+ {
+ if (scale)
+ {
+ ASSERT_MSG(DYNA_REC, 0,
+ "WriteNormalOp - MOV with 64-bit imm requires register destination");
+ }
+ // mov reg64, imm64
+ else if (op == NormalOp::MOV)
+ {
+ // movabs reg64, imm64 (10 bytes)
+ if (static_cast<s64>(operand.offset) != static_cast<s32>(operand.offset))
+ {
+ emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+ emit->Write64(operand.offset);
+ return;
+ }
+ // mov reg64, simm32 (7 bytes)
+ emit->Write8(op_def.imm32);
+ immToWrite = 32;
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm");
+ }
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits);
+ }
+
+ // pass extension in REG of ModRM
+ _operandReg = static_cast<X64Reg>(op_def.ext);
+ }
+ else
+ {
+ _operandReg = (X64Reg)operand.offsetOrBaseReg;
+ WriteREX(emit, bits, bits, _operandReg);
+ // op r/m, reg
+ if (toRM)
+ {
+ emit->Write8(bits == 8 ? op_def.toRm8 : op_def.toRm32);
+ }
+ // op reg, r/m
+ else
+ {
+ emit->Write8(bits == 8 ? op_def.fromRm8 : op_def.fromRm32);
+ }
+ }
+ WriteRest(emit, immToWrite >> 3, _operandReg);
+ switch (immToWrite)
+ {
+ case 0:
+ break;
+ case 8:
+ emit->Write8((u8)operand.offset);
+ break;
+ case 16:
+ emit->Write16((u16)operand.offset);
+ break;
+ case 32:
+ emit->Write32((u32)operand.offset);
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
+ }
+}
+
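+// Worked examples (illustrative) for the 64-bit imm paths above:
+// MOV(64, R(RAX), Imm64(0x123456789)) must take the 10-byte movabs form,
+// 48 B8 89 67 45 23 01 00 00 00, while MOV(64, R(RAX), Imm64(-1)) fits a
+// sign-extended simm32 and shrinks to 48 C7 C0 FF FF FF FF.
+void XEmitter::WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2)
+{
+ if (a1.IsImm())
+ {
+ // Booh! Can't write to an imm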
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm");
+ return;
+ }
+ if (a2.IsImm())
+ {
+ a1.WriteNormalOp(this, true, op, a2, bits);
+ }
+ else
+ {
+ if (a1.IsSimpleReg())
+ {
+ a2.WriteNormalOp(this, false, op, a1, bits);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(),
+ "WriteNormalOp - a1 and a2 cannot both be memory");
+ a1.WriteNormalOp(this, true, op, a2, bits);
+ }
+ }
+}
+
+void XEmitter::ADD(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::ADD, a1, a2);
+}
+void XEmitter::ADC(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::ADC, a1, a2);
+}
+void XEmitter::SUB(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::SUB, a1, a2);
+}
+void XEmitter::SBB(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::SBB, a1, a2);
+}
+void XEmitter::AND(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::AND, a1, a2);
+}
+void XEmitter::OR(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::OR, a1, a2);
+}
+void XEmitter::XOR(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::XOR, a1, a2);
+}
+void XEmitter::MOV(int bits, const OpArg& a1, const OpArg& a2)
+{
+ if (bits == 64 && a1.IsSimpleReg() && a2.scale == SCALE_IMM64 &&
+ a2.offset == static_cast<u32>(a2.offset))
+ {
+ WriteNormalOp(32, NormalOp::MOV, a1, a2.AsImm32());
+ return;
+ }
+ if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg())
+ {
+ ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code);
+ }
+ WriteNormalOp(bits, NormalOp::MOV, a1, a2);
+}
+void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::TEST, a1, a2);
+}
+void XEmitter::CMP(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::CMP, a1, a2);
+}
+void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2)
+{
+ WriteNormalOp(bits, NormalOp::XCHG, a1, a2);
+}
+void XEmitter::CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ if (a1.IsSimpleReg() && a2.IsZero()) // turn 'CMP reg, 0' into shorter 'TEST reg, reg'
+ {
+ WriteNormalOp(bits, NormalOp::TEST, a1, a1);
+ }
+ else
+ {
+ WriteNormalOp(bits, NormalOp::CMP, a1, a2);
+ }
+}
+
+void XEmitter::MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2)
+{
+ // This stomps on flags, so ensure they aren't locked
+ DEBUG_ASSERT(!flags_locked);
+
+ // Zero shortcuts (note that this can generate no code in the case where a1 == dest && a2 == zero
+ // or a2 == dest && a1 == zero)
+ if (a1.IsZero())
+ {
+ if (!a2.IsSimpleReg() || a2.GetSimpleReg() != dest)
+ {
+ MOV(bits, R(dest), a2);
+ }
+ return;
+ }
+ if (a2.IsZero())
+ {
+ if (!a1.IsSimpleReg() || a1.GetSimpleReg() != dest)
+ {
+ MOV(bits, R(dest), a1);
+ }
+ return;
+ }
+
+ // If dest == a1 or dest == a2 we can simplify this
+ if (a1.IsSimpleReg() && a1.GetSimpleReg() == dest)
+ {
+ ADD(bits, R(dest), a2);
+ return;
+ }
+
+ if (a2.IsSimpleReg() && a2.GetSimpleReg() == dest)
+ {
+ ADD(bits, R(dest), a1);
+ return;
+ }
+
+ // TODO: 32-bit optimizations may apply to other bit sizes (confirm)
+ if (bits == 32)
+ {
+ if (a1.IsImm() && a2.IsImm())
+ {
+ MOV(32, R(dest), Imm32(a1.Imm32() + a2.Imm32()));
+ return;
+ }
+
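+ // Usage sketch (illustrative): MOV_sum(32, EAX, R(ECX), Imm32(8)) reaches the
+ // reg + imm case below and folds into a single LEA EAX, [ECX + 8], avoiding a
+ // flag-clobbering MOV + ADD pair.
+ if (a1.IsSimpleReg() && a2.IsSimpleReg())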
+ {
+ LEA(32, dest, MRegSum(a1.GetSimpleReg(), a2.GetSimpleReg()));
+ return;
+ }
+
+ if (a1.IsSimpleReg() && a2.IsImm())
+ {
+ LEA(32, dest, MDisp(a1.GetSimpleReg(), a2.Imm32()));
+ return;
+ }
+
+ if (a1.IsImm() && a2.IsSimpleReg())
+ {
+ LEA(32, dest, MDisp(a2.GetSimpleReg(), a1.Imm32()));
+ return;
+ }
+ }
+
+ // Fallback
+ MOV(bits, R(dest), a1);
+ ADD(bits, R(dest), a2);
+}
+
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ if (bits == 8)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!");
+ return;
+ }
+
+ if (a1.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - second arg cannot be imm!");
+ return;
+ }
+
+ if (!a2.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - third arg must be imm!");
+ return;
+ }
+
+ if (bits == 16)
+ Write8(0x66);
+ a1.WriteREX(this, bits, bits, regOp);
+
+ if (a2.GetImmBits() == 8 || (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) ||
+ (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset))
+ {
+ Write8(0x6B);
+ a1.WriteRest(this, 1, regOp);
+ Write8((u8)a2.offset);
+ }
+ else
+ {
+ Write8(0x69);
+ if (a2.GetImmBits() == 16 && bits == 16)
+ {
+ a1.WriteRest(this, 2, regOp);
+ Write16((u16)a2.offset);
+ }
+ else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64))
+ {
+ a1.WriteRest(this, 4, regOp);
+ Write32((u32)a2.offset);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - unhandled case!");
+ }
+ }
+}
+
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a)
+{
+ CheckFlags();
+ if (bits == 8)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!");
+ return;
+ }
+
+ if (a.IsImm())
+ {
+ IMUL(bits, regOp, R(regOp), a);
+ return;
+ }
+
+ if (bits == 16)
+ Write8(0x66);
+ a.WriteREX(this, bits, bits, regOp);
+ Write8(0x0F);
+ Write8(0xAF);
+ a.WriteRest(this, 0, regOp);
+}
+
+void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+ if (opPrefix)
+ Write8(opPrefix);
+ arg.operandReg = regOp;
+ arg.WriteREX(this, 0, 0);
+ Write8(0x0F);
+ if (op > 0xFF)
+ Write8((op >> 8) & 0xFF);
+ Write8(op & 0xFF);
+ arg.WriteRest(this, extrabytes);
+}
+
+static int GetVEXmmmmm(u16 op)
+{
+ // Currently, only 0x38 and 0x3A are used as secondary escape bytes.
+ if ((op >> 8) == 0x3A)
+ return 3;
+ else if ((op >> 8) == 0x38)
+ return 2;
+ else
+ return 1;
+}
+
+static int GetVEXpp(u8 opPrefix)
+{
+ if (opPrefix == 0x66)
+ return 1;
+ else if (opPrefix == 0xF3)
+ return 2;
+ else if (opPrefix == 0xF2)
+ return 3;
+ else
+ return 0;
+}
+
+void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int W, int extrabytes)
+{
+ int mmmmm = GetVEXmmmmm(op);
+ int pp = GetVEXpp(opPrefix);
+ // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
+ arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W);
+ Write8(op & 0xFF);
+ arg.WriteRest(this, extrabytes, regOp1);
+}
+
+void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ X64Reg regOp3, int W)
+{
+ WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, 1);
+ Write8((u8)regOp3 << 4);
+}
+
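+// Worked example (illustrative): VADDSD(XMM1, XMM2, R(XMM3)) goes through
+// WriteVEXOp with opPrefix 0xF2 (pp = 3) and op sseADD (no 0x38/0x3A escape,
+// so mmmmm = 1), which typically encodes as the two-byte VEX form C5 EB 58 CB.
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int W, int extrabytes)
+{
+ if (!cpu_info.bAVX)
+ PanicAlert("Trying to use AVX on a system that doesn't support it.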
Bad programmer."); + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer."); + WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W); +} + +void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W) +{ + if (!cpu_info.bFMA) + PanicAlert("Trying to use FMA3 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W); +} + +void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W) +{ + if (!cpu_info.bFMA4) + PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W); +} + +void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (arg.IsImm()) + PanicAlert("BMI1/2 instructions don't support immediate operands."); + if (size != 32 && size != 64) + PanicAlert("BMI1/2 instructions only support 32-bit and 64-bit modes!"); + int W = size == 64; + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bBMI2) + PanicAlert("Trying to use BMI2 on a system that doesn't support it. 
Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6E, dest, arg, 0); +} +void XEmitter::MOVD_xmm(const OpArg& arg, X64Reg src) +{ + WriteSSEOp(0x66, 0x7E, src, arg, 0); +} + +void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) +{ + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = dest; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x6E); + arg.WriteRest(this, 0); +} + +void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) +{ + if (src > 7 || arg.IsSimpleReg()) + { + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = src; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x7E); + arg.WriteRest(this, 0); + } + else + { + arg.operandReg = src; + arg.WriteREX(this, 0, 0); + Write8(0x66); + Write8(0x0f); + Write8(0xD6); + arg.WriteRest(this, 0); + } +} + +void XEmitter::WriteMXCSR(OpArg arg, int ext) +{ + if (arg.IsImm() || arg.IsSimpleReg()) + ASSERT_MSG(DYNA_REC, 0, "MXCSR - invalid operand"); + + arg.operandReg = ext; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0xAE); + arg.WriteRest(this); +} + +void XEmitter::STMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 3); +} +void XEmitter::LDMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 2); +} + +void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg); +} +void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVNTP, regOp, arg); +} +void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTP, regOp, arg); +} + +void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseADD, regOp, arg); +} +void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseADD, regOp, arg); +} +void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSUB, regOp, arg); +} +void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSUB, regOp, arg); +} +void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::MULSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMUL, regOp, arg); +} +void XEmitter::MULSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMUL, regOp, arg); +} +void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseDIV, regOp, arg); +} +void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseDIV, regOp, arg); +} +void XEmitter::MINSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMIN, regOp, arg); +} +void XEmitter::MINSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMIN, regOp, arg); +} +void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMAX, regOp, arg); +} +void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMAX, regOp, arg); +} +void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSQRT, regOp, arg); +} +void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSQRT, regOp, arg); +} +void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRCP, regOp, arg); 
+} +void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRSQRT, regOp, arg); +} + +void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseADD, regOp, arg); +} +void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseADD, regOp, arg); +} +void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSUB, regOp, arg); +} +void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSUB, regOp, arg); +} +void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x00, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x66, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseAND, regOp, arg); +} +void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseAND, regOp, arg); +} +void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseANDN, regOp, arg); +} +void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseANDN, regOp, arg); +} +void XEmitter::ORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseOR, regOp, arg); +} +void XEmitter::ORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseOR, regOp, arg); +} +void XEmitter::XORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseXOR, regOp, arg); +} +void XEmitter::XORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseXOR, regOp, arg); +} +void XEmitter::MULPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMUL, regOp, arg); +} +void XEmitter::MULPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMUL, regOp, arg); +} +void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseDIV, regOp, arg); +} +void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseDIV, regOp, arg); +} +void XEmitter::MINPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMIN, regOp, arg); +} +void XEmitter::MINPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMIN, regOp, arg); +} +void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMAX, regOp, arg); +} +void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMAX, regOp, arg); +} +void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSQRT, regOp, arg); +} +void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSQRT, regOp, arg); +} +void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRCP, regOp, arg); +} +void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRSQRT, regOp, arg); +} +void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x00, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} + +void XEmitter::COMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseCOMIS, regOp, arg); +} // weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseCOMIS, regOp, arg); +} // ordered +void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseUCOMIS, regOp, arg); +} // unordered +void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseUCOMIS, regOp, arg); +} + +void 
XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg); +} +void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg); +} + +void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg); +} +void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg); +} + +void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); +} +void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); +} + +void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); +} +void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); +} + +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2)); +} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2)); +} + +void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5A, regOp, arg); +} +void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5A, regOp, arg); +} + +void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x5A, regOp, arg); +} +void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5A, regOp, arg); +} +void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2D, regOp, arg); +} +void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2D, regOp, arg); +} +void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2A, regOp, arg); +} +void 
XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2A, regOp, arg); +} + +void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0xE6, regOp, arg); +} +void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5B, regOp, arg); +} +void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0xE6, regOp, arg); +} +void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5B, regOp, arg); +} + +void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2C, regOp, arg); +} +void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2C, regOp, arg); +} +void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5B, regOp, arg); +} +void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE6, regOp, arg); +} + +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) +{ + WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src)); +} + +void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x50, dest, arg); +} +void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x50, dest, arg); +} + +void XEmitter::LDDQU(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseLDDQU, dest, arg); +} // For integer data only + +void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x14, dest, arg); +} +void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x15, dest, arg); +} +void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x14, dest, arg); +} +void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x15, dest, arg); +} + +// Pretty much every x86 CPU nowadays supports SSE3, +// but the SSE2 fallbacks are easy. 
+void XEmitter::MOVSLDUP(X64Reg regOp, const OpArg& arg)
+{
+ if (cpu_info.bSSE3)
+ {
+ WriteSSEOp(0xF3, 0x12, regOp, arg);
+ }
+ else
+ {
+ if (!arg.IsSimpleReg(regOp))
+ MOVAPD(regOp, arg);
+ UNPCKLPS(regOp, R(regOp));
+ }
+}
+void XEmitter::MOVSHDUP(X64Reg regOp, const OpArg& arg)
+{
+ if (cpu_info.bSSE3)
+ {
+ WriteSSEOp(0xF3, 0x16, regOp, arg);
+ }
+ else
+ {
+ if (!arg.IsSimpleReg(regOp))
+ MOVAPD(regOp, arg);
+ UNPCKHPS(regOp, R(regOp));
+ }
+}
+void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg)
+{
+ if (cpu_info.bSSE3)
+ {
+ WriteSSEOp(0xF2, 0x12, regOp, arg);
+ }
+ else
+ {
+ if (!arg.IsSimpleReg(regOp))
+ MOVSD(regOp, arg);
+ UNPCKLPD(regOp, R(regOp));
+ }
+}
+
+// There are a few more left
+
+// Also some integer instructions are missing
+void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x6B, dest, arg);
+}
+void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x63, dest, arg);
+}
+void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x67, dest, arg);
+}
+
+void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x60, dest, arg);
+}
+void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x61, dest, arg);
+}
+void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x62, dest, arg);
+}
+void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x6C, dest, arg);
+}
+
+void XEmitter::PSRLW(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSRLD(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xd3, reg, arg);
+}
+
+void XEmitter::PSRLDQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLW(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLD(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLDQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
+ Write8(shift);
+}
+
+// WARNING not REX compatible
+void XEmitter::PSRAW(X64Reg reg, int shift)
+{
+ if (reg > 7)
+ PanicAlert("The PSRAW-emitter does not support regs above 7");
+ Write8(0x66);
+ Write8(0x0f);
+ Write8(0x71);
+ Write8(0xE0 | reg);
+ Write8(shift);
+}
+
+// WARNING not REX compatible
+void XEmitter::PSRAD(X64Reg reg, int shift)
+{
+ if (reg > 7)
+ PanicAlert("The PSRAD-emitter does not support regs above 7");
+ Write8(0x66);
+ Write8(0x0f);
+ Write8(0x72);
+ Write8(0xE0 | reg);
+ Write8(shift);
+}
+
+void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
+{
+ if (!cpu_info.bSSSE3)
+ PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
+ WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
+}
+
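+// Worked example (illustrative): ops above 0xFF carry their secondary escape
+// byte in the high byte, so PSHUFB's 0x3800 below is emitted by WriteSSEOp as
+// 66 0F 38 00 /r and PTEST's 0x3817 as 66 0F 38 17 /r.
+void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
+{
+ if (!cpu_info.bSSE4_1)
+ PanicAlert("Trying to use SSE4.1 on a system that doesn't support it.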
Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg) +{ + WriteSSSE3Op(0x66, 0x3800, dest, arg); +} +void XEmitter::PTEST(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3817, dest, arg); +} +void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x382b, dest, arg); +} + +void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3820, dest, arg); +} +void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3821, dest, arg); +} +void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3822, dest, arg); +} +void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3823, dest, arg); +} +void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3824, dest, arg); +} +void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3825, dest, arg); +} +void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3830, dest, arg); +} +void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3831, dest, arg); +} +void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3832, dest, arg); +} +void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3833, dest, arg); +} +void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3834, dest, arg); +} +void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3835, dest, arg); +} + +void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3810, dest, arg); +} +void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3814, dest, arg); +} +void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3815, dest, arg); +} +void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); + Write8(blend); +} +void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); + Write8(blend); +} + +void XEmitter::PAND(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDB, dest, arg); +} +void XEmitter::PANDN(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDF, dest, arg); +} +void XEmitter::PXOR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEF, dest, arg); +} +void XEmitter::POR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEB, dest, arg); +} + +void XEmitter::PADDB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFC, dest, arg); +} +void XEmitter::PADDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFD, dest, arg); +} +void XEmitter::PADDD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFE, dest, arg); +} +void XEmitter::PADDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD4, dest, arg); +} + +void XEmitter::PADDSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEC, dest, arg); +} +void XEmitter::PADDSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xED, dest, arg); +} +void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDC, dest, arg); +} +void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDD, dest, arg); +} + +void XEmitter::PSUBB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF8, dest, arg); +} +void XEmitter::PSUBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF9, dest, arg); +} +void 
XEmitter::PSUBD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFA, dest, arg); +} +void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFB, dest, arg); +} + +void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE8, dest, arg); +} +void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE9, dest, arg); +} +void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD8, dest, arg); +} +void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD9, dest, arg); +} + +void XEmitter::PAVGB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE0, dest, arg); +} +void XEmitter::PAVGW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE3, dest, arg); +} + +void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x74, dest, arg); +} +void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x75, dest, arg); +} +void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x76, dest, arg); +} + +void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x64, dest, arg); +} +void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x65, dest, arg); +} +void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x66, dest, arg); +} + +void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC5, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC4, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSE41Op(0x66, 0x3A22, dest, arg); + Write8(subreg); +} + +void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF5, dest, arg); +} +void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF6, dest, arg); +} + +void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEE, dest, arg); +} +void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDE, dest, arg); +} +void XEmitter::PMINSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEA, dest, arg); +} +void XEmitter::PMINUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDA, dest, arg); +} + +void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD7, dest, arg); +} +void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF2, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF3, 0x70, regOp, arg, 1); + Write8(shuffle); +} + +// VEX +void XEmitter::VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + 
WriteAVXOp(0x00, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg); +} +void XEmitter::VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare) +{ + WriteAVXOp(0x66, sseCMP, regOp1, regOp2, arg, 0, 1); + Write8(compare); +} +void XEmitter::VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x00, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg); +} +void XEmitter::VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3) +{ + WriteAVXOp4(0x66, 0x3A4B, regOp1, regOp2, arg, regOp3); +} +void XEmitter::VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0C, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} +void XEmitter::VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0D, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); +} +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); +} +void 
XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); +} +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); +} + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); +} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); +} +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); +} +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); +} + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg 
regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1); +} + +#define FMA4(name, op) \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1); \ + } \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0); \ + } + +FMA4(VFMADDSUBPS, 0x5C) +FMA4(VFMADDSUBPD, 0x5D) +FMA4(VFMSUBADDPS, 0x5E) +FMA4(VFMSUBADDPD, 0x5F) +FMA4(VFMADDPS, 0x68) +FMA4(VFMADDPD, 0x69) +FMA4(VFMADDSS, 0x6A) +FMA4(VFMADDSD, 0x6B) +FMA4(VFMSUBPS, 0x6C) +FMA4(VFMSUBPD, 0x6D) +FMA4(VFMSUBSS, 0x6E) +FMA4(VFMSUBSD, 0x6F) +FMA4(VFNMADDPS, 0x78) +FMA4(VFNMADDPD, 0x79) +FMA4(VFNMADDSS, 0x7A) +FMA4(VFNMADDSD, 0x7B) +FMA4(VFNMSUBPS, 0x7C) +FMA4(VFNMSUBPD, 0x7D) +FMA4(VFNMSUBSS, 0x7E) +FMA4(VFNMSUBSD, 0x7F) +#undef FMA4 + +void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate) +{ + WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); + Write8(rotate); +} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg); +} +void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + CheckFlags(); + WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg); +} +void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg); +} +void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, 
(X64Reg)0x3, regOp, arg); +} +void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg); +} + +// Prefixes + +void XEmitter::LOCK() +{ + Write8(0xF0); +} +void XEmitter::REP() +{ + Write8(0xF3); +} +void XEmitter::REPNE() +{ + Write8(0xF2); +} +void XEmitter::FSOverride() +{ + Write8(0x64); +} +void XEmitter::GSOverride() +{ + Write8(0x65); +} + +void XEmitter::FWAIT() +{ + Write8(0x9B); +} + +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg) +{ + int mf = 0; + ASSERT_MSG(DYNA_REC, !(bits == 80 && op_80b == FloatOp::Invalid), + "WriteFloatLoadStore: 80 bits not supported for this instruction"); + switch (bits) + { + case 32: + mf = 0; + break; + case 64: + mf = 4; + break; + case 80: + mf = 2; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); + } + Write8(0xd9 | mf); + // x87 instructions use the reg field of the ModR/M byte as opcode: + if (bits == 80) + op = op_80b; + arg.WriteRest(this, 0, static_cast<X64Reg>(op)); +} + +void XEmitter::FLD(int bits, const OpArg& src) +{ + WriteFloatLoadStore(bits, FloatOp::LD, FloatOp::LD80, src); +} +void XEmitter::FST(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::ST, FloatOp::Invalid, dest); +} +void XEmitter::FSTP(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::STP, FloatOp::STP80, dest); +} +void XEmitter::FNSTSW_AX() +{ + Write8(0xDF); + Write8(0xE0); +} + +void XEmitter::RDTSC() +{ + Write8(0x0F); + Write8(0x31); +} +} diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h new file mode 100644 index 0000000..122850d --- /dev/null +++ b/src/dolphin/x64Emitter.h @@ -0,0 +1,1180 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!!
+ +#pragma once + +#include <cstddef> +#include <cstring> +#include <functional> +#include <tuple> +#include <type_traits> + +#include "Assert.h" +#include "BitSet.h" +#include "CodeBlock.h" +#include "../types.h" +#include "x64ABI.h" + +namespace Gen +{ +enum CCFlags +{ + CC_O = 0, + CC_NO = 1, + CC_B = 2, + CC_C = 2, + CC_NAE = 2, + CC_NB = 3, + CC_NC = 3, + CC_AE = 3, + CC_Z = 4, + CC_E = 4, + CC_NZ = 5, + CC_NE = 5, + CC_BE = 6, + CC_NA = 6, + CC_NBE = 7, + CC_A = 7, + CC_S = 8, + CC_NS = 9, + CC_P = 0xA, + CC_PE = 0xA, + CC_NP = 0xB, + CC_PO = 0xB, + CC_L = 0xC, + CC_NGE = 0xC, + CC_NL = 0xD, + CC_GE = 0xD, + CC_LE = 0xE, + CC_NG = 0xE, + CC_NLE = 0xF, + CC_G = 0xF +}; + +enum +{ + NUMGPRs = 16, + NUMXMMs = 16, +}; + +enum +{ + SCALE_NONE = 0, + SCALE_1 = 1, + SCALE_2 = 2, + SCALE_4 = 4, + SCALE_8 = 8, + SCALE_ATREG = 16, + // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG + SCALE_NOBASE_2 = 34, + SCALE_NOBASE_4 = 36, + SCALE_NOBASE_8 = 40, + SCALE_RIP = 0xFF, + SCALE_IMM8 = 0xF0, + SCALE_IMM16 = 0xF1, + SCALE_IMM32 = 0xF2, + SCALE_IMM64 = 0xF3, +}; + +enum SSECompare +{ + CMP_EQ = 0, + CMP_LT = 1, + CMP_LE = 2, + CMP_UNORD = 3, + CMP_NEQ = 4, + CMP_NLT = 5, + CMP_NLE = 6, + CMP_ORD = 7, +}; + +class XEmitter; +enum class FloatOp; +enum class NormalOp; + +// Information about a generated MOV op +struct MovInfo final +{ + u8* address; + bool nonAtomicSwapStore; + // valid iff nonAtomicSwapStore is true + X64Reg nonAtomicSwapStoreSrc; +}; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ + // For accessing offset and operandReg. + // This also allows us to keep the op writing functions private. + friend class XEmitter; + + // dummy op arg, used for storage + constexpr OpArg() = default; + constexpr OpArg(u64 offset_, int scale_, X64Reg rm_reg = RAX, X64Reg scaled_reg = RAX) + : scale{static_cast<u8>(scale_)}, offsetOrBaseReg{static_cast<u16>(rm_reg)}, + indexReg{static_cast<u16>(scaled_reg)}, offset{offset_} + { + } + constexpr bool operator==(const OpArg& b) const + { + // TODO: Use std::tie here once Dolphin requires C++17. (We can't do it immediately, + // (because we still support some older versions of GCC where std::tie is not constexpr.)
+ return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && + indexReg == b.indexReg && offset == b.offset; + } + constexpr bool operator!=(const OpArg& b) const { return !operator==(b); } + u64 Imm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (u64)offset; + } + u32 Imm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (u32)offset; + } + u16 Imm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (u16)offset; + } + u8 Imm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (u8)offset; + } + + s64 SImm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (s64)offset; + } + s32 SImm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (s32)offset; + } + s16 SImm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (s16)offset; + } + s8 SImm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (s8)offset; + } + + OpArg AsImm64() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u64)offset, SCALE_IMM64); + } + OpArg AsImm32() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u32)offset, SCALE_IMM32); + } + OpArg AsImm16() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u16)offset, SCALE_IMM16); + } + OpArg AsImm8() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u8)offset, SCALE_IMM8); + } + + constexpr bool IsImm() const + { + return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || + scale == SCALE_IMM64; + } + constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; } + constexpr bool IsSimpleReg(X64Reg reg) const { return IsSimpleReg() && GetSimpleReg() == reg; } + constexpr bool IsZero() const { return IsImm() && offset == 0; } + constexpr int GetImmBits() const + { + switch (scale) + { + case SCALE_IMM8: + return 8; + case SCALE_IMM16: + return 16; + case SCALE_IMM32: + return 32; + case SCALE_IMM64: + return 64; + default: + return -1; + } + } + + constexpr X64Reg GetSimpleReg() const + { + if (scale == SCALE_NONE) + return static_cast<X64Reg>(offsetOrBaseReg); + + return INVALID_REG; + } + + void AddMemOffset(int val) + { + DEBUG_ASSERT_MSG(DYNA_REC, scale == SCALE_RIP || (scale <= SCALE_ATREG && scale > SCALE_NONE), + "Tried to increment an OpArg which doesn't have an offset"); + offset += val; + } + +private: + void WriteREX(XEmitter* emit, int opBits, int bits, int customOp = -1) const; + void WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W = 0) const; + void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG, + bool warn_64bit_offset = true) const; + void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits); + void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const; + + u8 scale = 0; + u16 offsetOrBaseReg = 0; + u16 indexReg = 0; + u64 offset = 0; // Also used to store immediates.
+ u16 operandReg = 0; +}; + +template <typename T> +inline OpArg M(const T* ptr) +{ + return OpArg((u64)(const void*)ptr, (int)SCALE_RIP); +} +constexpr OpArg R(X64Reg value) +{ + return OpArg(0, SCALE_NONE, value); +} +constexpr OpArg MatR(X64Reg value) +{ + return OpArg(0, SCALE_ATREG, value); +} + +constexpr OpArg MDisp(X64Reg value, int offset) +{ + return OpArg(static_cast<u32>(offset), SCALE_ATREG, value); +} + +constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ + return OpArg(offset, scale, base, scaled); +} + +constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) +{ + if (scale == SCALE_1) + return OpArg(offset, SCALE_ATREG, scaled); + + return OpArg(offset, scale | 0x20, RAX, scaled); +} + +constexpr OpArg MRegSum(X64Reg base, X64Reg offset) +{ + return MComplex(base, offset, 1, 0); +} + +constexpr OpArg Imm8(u8 imm) +{ + return OpArg(imm, SCALE_IMM8); +} +constexpr OpArg Imm16(u16 imm) +{ + return OpArg(imm, SCALE_IMM16); +} // rarely used +constexpr OpArg Imm32(u32 imm) +{ + return OpArg(imm, SCALE_IMM32); +} +constexpr OpArg Imm64(u64 imm) +{ + return OpArg(imm, SCALE_IMM64); +} +inline OpArg ImmPtr(const void* imm) +{ + return Imm64(reinterpret_cast<u64>(imm)); +} + +inline u32 PtrOffset(const void* ptr, const void* base = nullptr) +{ + s64 distance = (s64)ptr - (s64)base; + if (distance >= 0x80000000LL || distance < -0x80000000LL) + { + ASSERT_MSG(DYNA_REC, 0, "pointer offset out of range"); + return 0; + } + + return (u32)distance; +} + +// usage: int a[]; ARRAY_OFFSET(a,10) +#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0])) +// usage: struct {int e;} s; STRUCT_OFFSET(s,e) +#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str))) + +struct FixupBranch +{ + enum class Type + { + Branch8Bit, + Branch32Bit + }; + + u8* ptr; + Type type; +}; + +class XEmitter +{ + friend struct OpArg; // for Write8 etc +private: + u8* code = nullptr; + bool flags_locked = false; + + void CheckFlags(); + + void Rex(int w, int r, int x, int b); + void WriteModRM(int mod, int reg, int rm); + void WriteSIB(int scale, int index, int base); + void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); + void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); + void WriteMulDivType(int bits, OpArg src, int ext); + void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); + void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext); + void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext); + void WriteMXCSR(OpArg arg, int ext); + void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void WriteBMIOp(int size, u8
opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteMOVBE(int bits, u8 op, X64Reg regOp, const OpArg& arg); + void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); + void WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2); + + void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + +protected: + void Write8(u8 value); + void Write16(u16 value); + void Write32(u32 value); + void Write64(u64 value); + +public: + XEmitter() = default; + explicit XEmitter(u8* code_ptr) : code{code_ptr} {} + virtual ~XEmitter() = default; + void SetCodePtr(u8* ptr); + void ReserveCodeSpace(int bytes); + u8* AlignCodeTo(size_t alignment); + u8* AlignCode4(); + u8* AlignCode16(); + u8* AlignCodePage(); + const u8* GetCodePtr() const; + u8* GetWritableCodePtr(); + + void LockFlags() { flags_locked = true; } + void UnlockFlags() { flags_locked = false; } + // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU + // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other + // string instr., + // INC and DEC are slow on Intel Core, but not on AMD. They create a + // false flag dependency because they only update a subset of the flags. + // XCHG is SLOW and should be avoided. + + // Debug breakpoint + void INT3(); + + // Do nothing + void NOP(size_t count = 1); + + // Save energy in wait-loops on P4 only. Probably not too useful. + void PAUSE(); + + // Flag control + void STC(); + void CLC(); + void CMC(); + + // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and + // AMD! + void LAHF(); // 3 cycle vector path + void SAHF(); // direct path fast + + // Stack control + void PUSH(X64Reg reg); + void POP(X64Reg reg); + void PUSH(int bits, const OpArg& reg); + void POP(int bits, const OpArg& reg); + void PUSHF(); + void POPF(); + + // Flow control + void RET(); + void RET_FAST(); + void UD2(); + FixupBranch J(bool force5bytes = false); + + void JMP(const u8* addr, bool force5Bytes = false); + void JMPptr(const OpArg& arg); + void JMPself(); // infinite loop! +#ifdef CALL +#undef CALL +#endif + void CALL(const void* fnptr); + FixupBranch CALL(); + void CALLptr(OpArg arg); + + FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); + void J_CC(CCFlags conditionCode, const u8* addr); + + void SetJumpTarget(const FixupBranch& branch); + + void SETcc(CCFlags flag, OpArg dest); + // Note: CMOV brings small if any benefit on current CPUs. 
+ void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + + // Fences + void LFENCE(); + void MFENCE(); + void SFENCE(); + + // Bit scan + void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit + void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit + + // Cache control + enum PrefetchLevel + { + PF_NTA, // Non-temporal (data used once and only once) + PF_T0, // All cache levels + PF_T1, // Levels 2+ (aliased to T0 on AMD) + PF_T2, // Levels 3+ (aliased to T0 on AMD) + }; + void PREFETCH(PrefetchLevel level, OpArg arg); + void MOVNTI(int bits, const OpArg& dest, X64Reg src); + void MOVNTDQ(const OpArg& arg, X64Reg regOp); + void MOVNTPS(const OpArg& arg, X64Reg regOp); + void MOVNTPD(const OpArg& arg, X64Reg regOp); + + // Multiplication / division + void MUL(int bits, const OpArg& src); // UNSIGNED + void IMUL(int bits, const OpArg& src); // SIGNED + void IMUL(int bits, X64Reg regOp, const OpArg& src); + void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm); + void DIV(int bits, const OpArg& src); + void IDIV(int bits, const OpArg& src); + + // Shift + void ROL(int bits, const OpArg& dest, const OpArg& shift); + void ROR_(int bits, const OpArg& dest, const OpArg& shift); + void RCL(int bits, const OpArg& dest, const OpArg& shift); + void RCR(int bits, const OpArg& dest, const OpArg& shift); + void SHL(int bits, const OpArg& dest, const OpArg& shift); + void SHR(int bits, const OpArg& dest, const OpArg& shift); + void SAR(int bits, const OpArg& dest, const OpArg& shift); + + // Bit Test + void BT(int bits, const OpArg& dest, const OpArg& index); + void BTS(int bits, const OpArg& dest, const OpArg& index); + void BTR(int bits, const OpArg& dest, const OpArg& index); + void BTC(int bits, const OpArg& dest, const OpArg& index); + + // Double-Precision Shift + void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + + // Extend EAX into EDX in various ways + void CWD(int bits = 16); + inline void CDQ() { CWD(32); } + inline void CQO() { CWD(64); } + void CBW(int bits = 8); + inline void CWDE() { CBW(16); } + inline void CDQE() { CBW(32); } + // Load effective address + void LEA(int bits, X64Reg dest, OpArg src); + + // Integer arithmetic + void NEG(int bits, const OpArg& src); + void ADD(int bits, const OpArg& a1, const OpArg& a2); + void ADC(int bits, const OpArg& a1, const OpArg& a2); + void SUB(int bits, const OpArg& a1, const OpArg& a2); + void SBB(int bits, const OpArg& a1, const OpArg& a2); + void AND(int bits, const OpArg& a1, const OpArg& a2); + void CMP(int bits, const OpArg& a1, const OpArg& a2); + + // Bit operations + void NOT(int bits, const OpArg& src); + void OR(int bits, const OpArg& a1, const OpArg& a2); + void XOR(int bits, const OpArg& a1, const OpArg& a2); + void MOV(int bits, const OpArg& a1, const OpArg& a2); + void TEST(int bits, const OpArg& a1, const OpArg& a2); + + void CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2); + void MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2); + + // Are these useful at all? Consider removing. + void XCHG(int bits, const OpArg& a1, const OpArg& a2); + void XCHG_AHAL(); + + // Byte swapping (32 and 64-bit only). 
+ void BSWAP(int bits, X64Reg reg); + + // Sign/zero extension + void MOVSX(int dbits, int sbits, X64Reg dest, + OpArg src); // automatically uses MOVSXD if necessary + void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); + + // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE. + void MOVBE(int bits, X64Reg dest, const OpArg& src); + void MOVBE(int bits, const OpArg& dest, X64Reg src); + void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false, + MovInfo* info = nullptr); + void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr); + + // Available only on AMD >= Phenom or Intel >= Haswell + void LZCNT(int bits, X64Reg dest, const OpArg& src); + // Note: this one is actually part of BMI1 + void TZCNT(int bits, X64Reg dest, const OpArg& src); + + // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) + void STMXCSR(const OpArg& memloc); + void LDMXCSR(const OpArg& memloc); + + // Prefixes + void LOCK(); + void REP(); + void REPNE(); + void FSOverride(); + void GSOverride(); + + // x87 + enum x87StatusWordBits + { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + + void FLD(int bits, const OpArg& src); + void FST(int bits, const OpArg& dest); + void FSTP(int bits, const OpArg& dest); + void FNSTSW_AX(); + void FWAIT(); + + // SSE/SSE2: Floating point arithmetic + void ADDSS(X64Reg regOp, const OpArg& arg); + void ADDSD(X64Reg regOp, const OpArg& arg); + void SUBSS(X64Reg regOp, const OpArg& arg); + void SUBSD(X64Reg regOp, const OpArg& arg); + void MULSS(X64Reg regOp, const OpArg& arg); + void MULSD(X64Reg regOp, const OpArg& arg); + void DIVSS(X64Reg regOp, const OpArg& arg); + void DIVSD(X64Reg regOp, const OpArg& arg); + void MINSS(X64Reg regOp, const OpArg& arg); + void MINSD(X64Reg regOp, const OpArg& arg); + void MAXSS(X64Reg regOp, const OpArg& arg); + void MAXSD(X64Reg regOp, const OpArg& arg); + void SQRTSS(X64Reg regOp, const OpArg& arg); + void SQRTSD(X64Reg regOp, const OpArg& arg); + void RCPSS(X64Reg regOp, const OpArg& arg); + void RSQRTSS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point bitwise (yes) + void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare); + + // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) + void ADDPS(X64Reg regOp, const OpArg& arg); + void ADDPD(X64Reg regOp, const OpArg& arg); + void SUBPS(X64Reg regOp, const OpArg& arg); + void SUBPD(X64Reg regOp, const OpArg& arg); + void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare); + void MULPS(X64Reg regOp, const OpArg& arg); + void MULPD(X64Reg regOp, const OpArg& arg); + void DIVPS(X64Reg regOp, const OpArg& arg); + void DIVPD(X64Reg regOp, const OpArg& arg); + void MINPS(X64Reg regOp, const OpArg& arg); + void MINPD(X64Reg regOp, const OpArg& arg); + void MAXPS(X64Reg regOp, const OpArg& arg); + void MAXPD(X64Reg regOp, const OpArg& arg); + void SQRTPS(X64Reg regOp, const OpArg& arg); + void SQRTPD(X64Reg regOp, const OpArg& arg); + void RCPPS(X64Reg regOp, const OpArg& arg); + void RSQRTPS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point 
packed bitwise (x4 for float, x2 for double) + void ANDPS(X64Reg regOp, const OpArg& arg); + void ANDPD(X64Reg regOp, const OpArg& arg); + void ANDNPS(X64Reg regOp, const OpArg& arg); + void ANDNPD(X64Reg regOp, const OpArg& arg); + void ORPS(X64Reg regOp, const OpArg& arg); + void ORPD(X64Reg regOp, const OpArg& arg); + void XORPS(X64Reg regOp, const OpArg& arg); + void XORPD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. + void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle); + void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle); + + // SSE3 + void MOVSLDUP(X64Reg regOp, const OpArg& arg); + void MOVSHDUP(X64Reg regOp, const OpArg& arg); + void MOVDDUP(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Useful alternative to shuffle in some cases. + void UNPCKLPS(X64Reg dest, const OpArg& src); + void UNPCKHPS(X64Reg dest, const OpArg& src); + void UNPCKLPD(X64Reg dest, const OpArg& src); + void UNPCKHPD(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Compares. + void COMISS(X64Reg regOp, const OpArg& arg); + void COMISD(X64Reg regOp, const OpArg& arg); + void UCOMISS(X64Reg regOp, const OpArg& arg); + void UCOMISD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Moves. Use the right data type for your data, in most cases. + void MOVAPS(X64Reg regOp, const OpArg& arg); + void MOVAPD(X64Reg regOp, const OpArg& arg); + void MOVAPS(const OpArg& arg, X64Reg regOp); + void MOVAPD(const OpArg& arg, X64Reg regOp); + + void MOVUPS(X64Reg regOp, const OpArg& arg); + void MOVUPD(X64Reg regOp, const OpArg& arg); + void MOVUPS(const OpArg& arg, X64Reg regOp); + void MOVUPD(const OpArg& arg, X64Reg regOp); + + void MOVDQA(X64Reg regOp, const OpArg& arg); + void MOVDQA(const OpArg& arg, X64Reg regOp); + void MOVDQU(X64Reg regOp, const OpArg& arg); + void MOVDQU(const OpArg& arg, X64Reg regOp); + + void MOVSS(X64Reg regOp, const OpArg& arg); + void MOVSD(X64Reg regOp, const OpArg& arg); + void MOVSS(const OpArg& arg, X64Reg regOp); + void MOVSD(const OpArg& arg, X64Reg regOp); + + void MOVLPS(X64Reg regOp, const OpArg& arg); + void MOVLPD(X64Reg regOp, const OpArg& arg); + void MOVLPS(const OpArg& arg, X64Reg regOp); + void MOVLPD(const OpArg& arg, X64Reg regOp); + + void MOVHPS(X64Reg regOp, const OpArg& arg); + void MOVHPD(X64Reg regOp, const OpArg& arg); + void MOVHPS(const OpArg& arg, X64Reg regOp); + void MOVHPD(const OpArg& arg, X64Reg regOp); + + void MOVHLPS(X64Reg regOp1, X64Reg regOp2); + void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + + // Be careful when using these overloads for reg <--> xmm moves. + // The one you cast to OpArg with R(reg) is the x86 reg, the other + // one is the xmm reg. + // ie: "MOVD_xmm(eax, R(xmm1))" generates incorrect code (movd xmm0, rcx) + // use "MOVD_xmm(R(eax), xmm1)" instead. + void MOVD_xmm(X64Reg dest, const OpArg& arg); + void MOVQ_xmm(X64Reg dest, OpArg arg); + void MOVD_xmm(const OpArg& arg, X64Reg src); + void MOVQ_xmm(OpArg arg, X64Reg src); + + // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in + // question. + void MOVMSKPS(X64Reg dest, const OpArg& arg); + void MOVMSKPD(X64Reg dest, const OpArg& arg); + + // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a + // weird one. + void MASKMOVDQU(X64Reg dest, X64Reg src); + void LDDQU(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Data type conversions. 
+ void CVTPS2PD(X64Reg dest, const OpArg& src); + void CVTPD2PS(X64Reg dest, const OpArg& src); + void CVTSS2SD(X64Reg dest, const OpArg& src); + void CVTSI2SS(X64Reg dest, const OpArg& src); + void CVTSD2SS(X64Reg dest, const OpArg& src); + void CVTSI2SD(X64Reg dest, const OpArg& src); + void CVTDQ2PD(X64Reg regOp, const OpArg& arg); + void CVTPD2DQ(X64Reg regOp, const OpArg& arg); + void CVTDQ2PS(X64Reg regOp, const OpArg& arg); + void CVTPS2DQ(X64Reg regOp, const OpArg& arg); + + void CVTTPS2DQ(X64Reg regOp, const OpArg& arg); + void CVTTPD2DQ(X64Reg regOp, const OpArg& arg); + + // Destinations are X64 regs (rax, rbx, ...) for these instructions. + void CVTSS2SI(X64Reg xregdest, const OpArg& src); + void CVTSD2SI(X64Reg xregdest, const OpArg& src); + void CVTTSS2SI(X64Reg xregdest, const OpArg& arg); + void CVTTSD2SI(X64Reg xregdest, const OpArg& arg); + + // SSE2: Packed integer instructions + void PACKSSDW(X64Reg dest, const OpArg& arg); + void PACKSSWB(X64Reg dest, const OpArg& arg); + void PACKUSDW(X64Reg dest, const OpArg& arg); + void PACKUSWB(X64Reg dest, const OpArg& arg); + + void PUNPCKLBW(X64Reg dest, const OpArg& arg); + void PUNPCKLWD(X64Reg dest, const OpArg& arg); + void PUNPCKLDQ(X64Reg dest, const OpArg& arg); + void PUNPCKLQDQ(X64Reg dest, const OpArg& arg); + + void PTEST(X64Reg dest, const OpArg& arg); + void PAND(X64Reg dest, const OpArg& arg); + void PANDN(X64Reg dest, const OpArg& arg); + void PXOR(X64Reg dest, const OpArg& arg); + void POR(X64Reg dest, const OpArg& arg); + + void PADDB(X64Reg dest, const OpArg& arg); + void PADDW(X64Reg dest, const OpArg& arg); + void PADDD(X64Reg dest, const OpArg& arg); + void PADDQ(X64Reg dest, const OpArg& arg); + + void PADDSB(X64Reg dest, const OpArg& arg); + void PADDSW(X64Reg dest, const OpArg& arg); + void PADDUSB(X64Reg dest, const OpArg& arg); + void PADDUSW(X64Reg dest, const OpArg& arg); + + void PSUBB(X64Reg dest, const OpArg& arg); + void PSUBW(X64Reg dest, const OpArg& arg); + void PSUBD(X64Reg dest, const OpArg& arg); + void PSUBQ(X64Reg dest, const OpArg& arg); + + void PSUBSB(X64Reg dest, const OpArg& arg); + void PSUBSW(X64Reg dest, const OpArg& arg); + void PSUBUSB(X64Reg dest, const OpArg& arg); + void PSUBUSW(X64Reg dest, const OpArg& arg); + + void PAVGB(X64Reg dest, const OpArg& arg); + void PAVGW(X64Reg dest, const OpArg& arg); + + void PCMPEQB(X64Reg dest, const OpArg& arg); + void PCMPEQW(X64Reg dest, const OpArg& arg); + void PCMPEQD(X64Reg dest, const OpArg& arg); + + void PCMPGTB(X64Reg dest, const OpArg& arg); + void PCMPGTW(X64Reg dest, const OpArg& arg); + void PCMPGTD(X64Reg dest, const OpArg& arg); + + void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg); + + void PMADDWD(X64Reg dest, const OpArg& arg); + void PSADBW(X64Reg dest, const OpArg& arg); + + void PMAXSW(X64Reg dest, const OpArg& arg); + void PMAXUB(X64Reg dest, const OpArg& arg); + void PMINSW(X64Reg dest, const OpArg& arg); + void PMINUB(X64Reg dest, const OpArg& arg); + + void PMOVMSKB(X64Reg dest, const OpArg& arg); + void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFB(X64Reg dest, const OpArg& arg); + + void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); + + void PSRLW(X64Reg reg, int shift); + void PSRLD(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, const OpArg& arg); + void PSRLDQ(X64Reg reg, 
int shift); + + void PSLLW(X64Reg reg, int shift); + void PSLLD(X64Reg reg, int shift); + void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + + void PSRAW(X64Reg reg, int shift); + void PSRAD(X64Reg reg, int shift); + + // SSE4: data type conversions + void PMOVSXBW(X64Reg dest, const OpArg& arg); + void PMOVSXBD(X64Reg dest, const OpArg& arg); + void PMOVSXBQ(X64Reg dest, const OpArg& arg); + void PMOVSXWD(X64Reg dest, const OpArg& arg); + void PMOVSXWQ(X64Reg dest, const OpArg& arg); + void PMOVSXDQ(X64Reg dest, const OpArg& arg); + void PMOVZXBW(X64Reg dest, const OpArg& arg); + void PMOVZXBD(X64Reg dest, const OpArg& arg); + void PMOVZXBQ(X64Reg dest, const OpArg& arg); + void PMOVZXWD(X64Reg dest, const OpArg& arg); + void PMOVZXWQ(X64Reg dest, const OpArg& arg); + void PMOVZXDQ(X64Reg dest, const OpArg& arg); + + // SSE4: blend instructions + void PBLENDVB(X64Reg dest, const OpArg& arg); + void BLENDVPS(X64Reg dest, const OpArg& arg); + void BLENDVPD(X64Reg dest, const OpArg& arg); + void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); + void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + + // AVX + void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare); + void VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg mask); + void VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + void VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + + void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + // FMA3 + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PS(X64Reg 
regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + +#define FMA4(name) \ + void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \ + void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + + FMA4(VFMADDSUBPS) + FMA4(VFMADDSUBPD) + FMA4(VFMSUBADDPS) + FMA4(VFMSUBADDPD) + FMA4(VFMADDPS) + FMA4(VFMADDPD) + FMA4(VFMADDSS) + FMA4(VFMADDSD) + FMA4(VFMSUBPS) + FMA4(VFMSUBPD) + FMA4(VFMSUBSS) + FMA4(VFMSUBSD) + FMA4(VFNMADDPS) + FMA4(VFNMADDPD) + FMA4(VFNMADDSS) + FMA4(VFNMADDSD) + FMA4(VFNMSUBPS) + FMA4(VFNMSUBPD) + FMA4(VFNMSUBSS) + FMA4(VFNMSUBSD) +#undef FMA4 + + // VEX GPR instructions + void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate); + void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void BLSR(int bits, X64Reg regOp, const OpArg& arg); + void BLSMSK(int bits, X64Reg regOp, const OpArg& arg); + void BLSI(int bits, X64Reg regOp, const OpArg& arg); + void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void RDTSC(); + + // Utility functions + // The difference between this and CALL is that this aligns the stack + // where appropriate. 
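+ // A minimal usage sketch (illustrative only, not part of the original Dolphin sources; + // HypotheticalTrace is an invented callee used purely for the example): + // static void HypotheticalTrace(u32 pc) { printf("pc=%08x\n", pc); } + // ... + // emitter.ABI_CallFunctionC(HypotheticalTrace, 0x02000000); + // ABI_CallFunctionC loads its constant into ABI_PARAM1 and defers to ABI_CallFunction, + // which emits a near rel32 CALL when the callee lies within +/-2 GiB of the code buffer + // and otherwise falls back to MOV RAX, imm64 followed by CALLptr (the "far call" path below).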
+ template <typename FunctionPointer> + void ABI_CallFunction(FunctionPointer func) + { + static_assert(std::is_pointer<FunctionPointer>() && + std::is_function<std::remove_pointer_t<FunctionPointer>>(), + "Supplied type must be a function pointer."); + + const void* ptr = reinterpret_cast<const void*>(func); + const u64 address = reinterpret_cast<u64>(ptr); + const u64 distance = address - (reinterpret_cast<u64>(code) + 5); + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) + { + // Far call + MOV(64, R(RAX), Imm64(address)); + CALLptr(R(RAX)); + } + else + { + CALL(ptr); + } + } + + template <typename FunctionPointer> + void ABI_CallFunctionC16(FunctionPointer func, u16 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC16(FunctionPointer func, u32 param1, u16 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionC(FunctionPointer func, u32 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC(FunctionPointer func, u32 param1, u32 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCP(FunctionPointer func, u32 param1, const void* param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCC(FunctionPointer func, u32 param1, u32 param2, u32 param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCP(FunctionPointer func, u32 param1, u32 param2, const void* param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(64, R(ABI_PARAM3), Imm64(reinterpret_cast<u64>(param3))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCCP(FunctionPointer func, u32 param1, u32 param2, u32 param3, + const void* param4) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + MOV(64, R(ABI_PARAM4), Imm64(reinterpret_cast<u64>(param4))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPC(FunctionPointer func, const void* param1, u32 param2) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPPC(FunctionPointer func, const void* param1, const void* param2, u32 param3) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + // Pass a register as a parameter. + template <typename FunctionPointer> + void ABI_CallFunctionR(FunctionPointer func, X64Reg reg1) + { + if (reg1 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(reg1)); + ABI_CallFunction(func); + } + + // Pass two registers as parameters.
+ template <typename FunctionPointer> + void ABI_CallFunctionRR(FunctionPointer func, X64Reg reg1, X64Reg reg2) + { + MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionAC(int bits, FunctionPointer func, const Gen::OpArg& arg1, u32 param2) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionA(int bits, FunctionPointer func, const Gen::OpArg& arg1) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + ABI_CallFunction(func); + } + + // Helper method for ABI functions related to calling functions. May be used by itself as well. + void MOVTwo(int bits, X64Reg dst1, X64Reg src1, s32 offset, X64Reg dst2, X64Reg src2); + + // Saves/restores the registers and adjusts the stack to be aligned as + // required by the ABI, where the previous alignment was as specified. + // Push returns the size of the shadow space, i.e. the offset of the frame. + size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template <typename T, typename... Args> + static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args) + { + return (*f)(args...); + } + + template <typename T> + void ABI_CallLambdaC(const std::function<T(u32)>* f, u32 p1) + { + auto trampoline = &XEmitter::CallLambdaTrampoline<T, u32>; + ABI_CallFunctionPC(trampoline, reinterpret_cast<const void*>(f), p1); + } +}; // class XEmitter + +class X64CodeBlock : public Common::CodeBlock<XEmitter> +{ +private: + void PoisonMemory() override + { + // x86/64: 0xCC = breakpoint + memset(region, 0xCC, region_size); + } +}; + +} // namespace diff --git a/src/dolphin/x64Reg.h b/src/dolphin/x64Reg.h new file mode 100644 index 0000000..a92e024 --- /dev/null +++ b/src/dolphin/x64Reg.h @@ -0,0 +1,96 @@ +// Copyright 2016 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included.
+ +#pragma once + +namespace Gen +{ +enum X64Reg +{ + EAX = 0, + EBX = 3, + ECX = 1, + EDX = 2, + ESI = 6, + EDI = 7, + EBP = 5, + ESP = 4, + + RAX = 0, + RBX = 3, + RCX = 1, + RDX = 2, + RSI = 6, + RDI = 7, + RBP = 5, + RSP = 4, + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, + + AL = 0, + BL = 3, + CL = 1, + DL = 2, + SIL = 6, + DIL = 7, + BPL = 5, + SPL = 4, + AH = 0x104, + BH = 0x107, + CH = 0x105, + DH = 0x106, + + AX = 0, + BX = 3, + CX = 1, + DX = 2, + SI = 6, + DI = 7, + BP = 5, + SP = 4, + + XMM0 = 0, + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7, + XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + + YMM0 = 0, + YMM1, + YMM2, + YMM3, + YMM4, + YMM5, + YMM6, + YMM7, + YMM8, + YMM9, + YMM10, + YMM11, + YMM12, + YMM13, + YMM14, + YMM15, + + INVALID_REG = 0xFFFFFFFF +}; + +} // namespace Gen -- cgit v1.2.3 From 2f6b46fd4f4eb593746391131e2523f5252d0ea4 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 17:09:27 +0200 Subject: JIT: implemented most ALU instructions --- src/ARM.cpp | 18 +- src/ARMJIT.cpp | 16 +- src/ARMJIT.h | 25 +- src/ARMJIT_RegCache.h | 136 +++++++++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 546 +++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 245 ++++++++--------- src/ARMJIT_x64/ARMJIT_Compiler.h | 60 +++- src/CMakeLists.txt | 1 + 8 files changed, 881 insertions(+), 166 deletions(-) create mode 100644 src/ARMJIT_RegCache.h create mode 100644 src/ARMJIT_x64/ARMJIT_ALU.cpp (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index f2b92b4..eadedc7 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -560,10 +560,10 @@ void ARMv5::Execute() AddCycles_C(); }*/ - if (!ARMJIT::IsMapped(Num, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]); + /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]);*/ - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(Num, R[15] - ((CPSR&0x20)?2:4)); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) block = ARMJIT::CompileBlock(this); Cycles += block(); @@ -615,7 +615,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - if (CPSR & 0x20) // THUMB + /*if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -643,7 +643,15 @@ void ARMv4::Execute() } else AddCycles_C(); - } + }*/ + + /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]);*/ + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); + if (block == NULL) + block = ARMJIT::CompileBlock(this); + Cycles += block(); // TODO optimize this shit!!! 
if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 489cdcf..74e154b 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,5 +1,7 @@ #include "ARMJIT.h" +#include <string.h> + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -8,7 +10,6 @@ namespace ARMJIT Compiler* compiler; BlockCache cache; - #define DUP2(x) x, x static ptrdiff_t JIT_MEM[2][32] = { @@ -174,4 +175,17 @@ CompiledBlock CompileBlock(ARM* cpu) return block; } +void ResetBlocks() +{ + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); + memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); + memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); + memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); + memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); + memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); + memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); + memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); + memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index d718295..2ca29e8 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -3,8 +3,6 @@ #include "types.h" -#include <string.h> - #include "ARM.h" #include "ARM_InstrInfo.h" @@ -13,14 +11,6 @@ namespace ARMJIT typedef u32 (*CompiledBlock)(); -class RegCache -{ - -static const int NativeRegAllocOrder[]; -static const int NativeRegsCount; - -}; - struct FetchedInstr { u32 A_Reg(int pos) const @@ -117,24 +107,13 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; } -inline void ResetBlocks() -{ - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); -} - void Init(); void DeInit(); CompiledBlock CompileBlock(ARM* cpu); +void ResetBlocks(); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h new file mode 100644 index 0000000..e18d50f --- /dev/null +++ b/src/ARMJIT_RegCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include <assert.h> + +namespace ARMJIT +{ + +template <typename T, typename Reg> +class RegCache +{ +public: + RegCache() + {} + + RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->UnloadReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg :
loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs; + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..d06c99c --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -0,0 +1,546 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +// uses RSCRATCH3 +void Compiler::Comp_ArithTriOp(void (Compiler::*op)(int, const OpArg&, const OpArg&), + OpArg rd, OpArg rn, OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (rd == rn && !(opFlags & opInvertOp2)) + (this->*op)(32, rd, op2); + else if (opFlags & opSymmetric && op2 == R(RSCRATCH)) + { + if (opFlags & opInvertOp2) + NOT(32, op2); + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + } + else + { + if (opFlags & opInvertOp2) + { + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + NOT(32, op2); + } + MOV(32, R(RSCRATCH3), rn); + (this->*op)(32, R(RSCRATCH3), op2); + MOV(32, rd, R(RSCRATCH3)); + } + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) +{ + switch (op) + { + case 0: // TST + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + 
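+ // NOTE: ARM and x86 define carry differently for subtraction: ARM's C flag
+ // means "no borrow", x86's CF means "borrow". That is why Comp_RetriveFlags
+ // is told to invert carry for CMP (op == 2) below. Roughly, assuming the
+ // usual emitter calls:
+ //
+ //     CMP(32, rn, op2);          // x86: CF = 1 on borrow
+ //     SETcc(CC_NC, R(RSCRATCH)); // ARM: C = !CF after a subtraction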
break; + case 2: // CMP + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; + } + + Comp_RetriveFlags(op == 2, op >= 2, carryUsed); +} + +// also calculates cycles +OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) +{ + if (CurrentInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + carryUsed = false; + return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + if (CurrentInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + rm = Imm32(rm.Imm32() + 4); + return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + } + else + { + Comp_AddCycles_C(); + return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, + MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + } + } +} + +void Compiler::A_Comp_CmpOp() +{ + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); + + Comp_CmpOp(op - 0x8, rn, op2, carryUsed); +} + +void Compiler::A_Comp_Arith() +{ + bool S = CurrentInstr.Instr & (1 << 20); + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); + + u32 sFlag = S ? opSetsFlags : 0; + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x2: // SUB + Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x3: // RSB + if (op2.IsZero()) + { + if (rd != rn) + MOV(32, rd, rn); + NEG(32, rd); + if (S) + Comp_RetriveFlags(true, true, false); + } + else + Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x4: // ADD + Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + return; + case 0x7: // RSC + Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + return; + default: + assert("unimplemented"); + } +} + +void Compiler::A_Comp_MovOp() +{ + bool carryUsed; + bool S = CurrentInstr.Instr & (1 << 20); + OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + + if (rd != op2) + MOV(32, rd, op2); + + if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + NOT(32, rd); + + if (S) + { + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); + } +} + +void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) +{ + CPSRDirty = true; + + bool carryOnly = !retriveCV && carryUsed; + if (retriveCV) + { + SETcc(CC_O, R(RSCRATCH)); + SETcc(sign ? 
CC_NC : CC_C, R(RSCRATCH3));
+ LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0));
+ }
+
+ if (carryUsed == 983298)
+ printf("something is rotten in the state of denmark %x\n", CurrentInstr.Instr);
+
+ SETcc(CC_S, R(RSCRATCH));
+ SETcc(CC_Z, R(RSCRATCH3));
+ LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0));
+ int shiftAmount = 30;
+ if (retriveCV || carryUsed)
+ {
+ LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0));
+ shiftAmount = carryOnly ? 29 : 28;
+ }
+ SHL(32, R(RSCRATCH), Imm8(shiftAmount));
+
+ AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28)));
+ OR(32, R(RCPSR), R(RSCRATCH));
+}
+
+// always uses RSCRATCH, RSCRATCH2 only if S == true
+OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed)
+{
+ carryUsed = S;
+
+ if (S)
+ {
+ XOR(32, R(RSCRATCH2), R(RSCRATCH2));
+ BT(32, R(RCPSR), Imm8(29));
+ SETcc(CC_C, R(RSCRATCH2));
+ }
+
+ MOV(32, R(RSCRATCH), rm);
+ static_assert(RSCRATCH3 == ECX);
+ MOV(32, R(ECX), rs);
+ AND(32, R(ECX), Imm32(0xFF));
+
+ FixupBranch zero = J_CC(CC_Z);
+ if (op < 3)
+ {
+ void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL;
+ if (op == 0)
+ shiftOp = SHL;
+ else if (op == 1)
+ shiftOp = SHR;
+ else if (op == 2)
+ shiftOp = SAR;
+
+ CMP(32, R(ECX), Imm8(32));
+ FixupBranch lt32 = J_CC(CC_L);
+ FixupBranch done1;
+ if (op < 2)
+ {
+ FixupBranch eq32 = J_CC(CC_E);
+ XOR(32, R(RSCRATCH), R(RSCRATCH));
+ if (S)
+ XOR(32, R(RSCRATCH2), R(RSCRATCH2));
+ done1 = J();
+ SetJumpTarget(eq32);
+ }
+ (this->*shiftOp)(32, R(RSCRATCH), Imm8(31));
+ (this->*shiftOp)(32, R(RSCRATCH), Imm8(1));
+ if (S)
+ SETcc(CC_C, R(RSCRATCH2));
+
+ FixupBranch done2 = J();
+
+ SetJumpTarget(lt32);
+ (this->*shiftOp)(32, R(RSCRATCH), R(ECX));
+ if (S)
+ SETcc(CC_C, R(RSCRATCH2));
+
+ if (op < 2)
+ SetJumpTarget(done1);
+ SetJumpTarget(done2);
+
+ }
+ else if (op == 3)
+ {
+ if (S)
+ BT(32, R(RSCRATCH), Imm8(31));
+ ROR_(32, R(RSCRATCH), R(ECX));
+ if (S)
+ SETcc(CC_C, R(RSCRATCH2));
+ }
+ SetJumpTarget(zero);
+
+ return R(RSCRATCH);
+}
+
+// may use RSCRATCH for op2 and RSCRATCH2 for the carryValue
+OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& carryUsed)
+{
+ carryUsed = true;
+
+ switch (op)
+ {
+ case 0: // LSL
+ if (amount > 0)
+ {
+ MOV(32, R(RSCRATCH), rm);
+ SHL(32, R(RSCRATCH), Imm8(amount));
+ if (S)
+ SETcc(CC_C, R(RSCRATCH2));
+
+ return R(RSCRATCH);
+ }
+ else
+ {
+ carryUsed = false;
+ return rm;
+ }
+ case 1: // LSR
+ if (amount > 0)
+ {
+ MOV(32, R(RSCRATCH), rm);
+ SHR(32, R(RSCRATCH), Imm8(amount));
+ if (S)
+ SETcc(CC_C, R(RSCRATCH2));
+ return R(RSCRATCH);
+ }
+ else
+ {
+ if (S)
+ {
+ MOV(32, R(RSCRATCH2), rm);
+ SHR(32, R(RSCRATCH2), Imm8(31));
+ }
+ return Imm32(0);
+ }
+ case 2: // ASR
+ MOV(32, R(RSCRATCH), rm);
+ SAR(32, R(RSCRATCH), Imm8(amount ? 
amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + + assert(false); +} + +void Compiler::T_Comp_ShiftImm() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + int amount = (CurrentInstr.Instr >> 6) & 0x1F; + + Comp_AddCycles_C(); + + bool carryUsed; + OpArg shifted = Comp_RegShiftImm(op, amount, rs, true, carryUsed); + + if (shifted != rd) + MOV(32, rd, shifted); + + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); +} + +void Compiler::T_Comp_AddSub_() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 9) & 0x3; + + OpArg rn = op >= 2 ? Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + + Comp_AddCycles_C(); + + if (op & 1) + Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + else + Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); +} + +void Compiler::T_Comp_ALU_Imm8() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + + u32 op = (CurrentInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; + } +} + +void Compiler::T_Comp_ALU() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + u32 op = (CurrentInstr.Instr >> 6) & 0xF; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + int shiftOp = op == 7 ? 
3 : op - 0x2; + bool carryUsed; + OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); + TEST(32, shifted, shifted); + MOV(32, rd, shifted); + Comp_RetriveFlags(false, false, true); + } + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + return; + case 0x8: // TST + Comp_CmpOp(0, rd, rs, false); + return; + case 0x9: // NEG + if (rd != rs) + MOV(32, rd, rs); + NEG(32, rd); + Comp_RetriveFlags(true, true, false); + return; + case 0xA: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0xB: // CMN + Comp_CmpOp(3, rd, rs, false); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + return; + case 0xF: // MVN + if (rd != rs) + MOV(32, rd, rs); + NOT(32, rd); + Comp_RetriveFlags(false, false, false); + return; + default: + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); + OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + + u32 op = (CurrentInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // ADD + Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); + return; + case 0x1: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0x2: // MOV + if (rd != rs) + MOV(32, rd, rs); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + } +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fb2fda8..f51d4d9 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -8,18 +8,16 @@ using namespace Gen; namespace ARMJIT { - -const int RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; -const int RegCache::NativeRegsCount = 5; +template <> +const X64Reg RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +template <> +const int RegCache::NativeRegsAvailable = 5; Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 4); } -typedef void (Compiler::*CompileFunc)(); -typedef void (*InterpretFunc)(ARM*); - void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -36,6 +34,19 @@ void Compiler::SaveCPSR() } } +void Compiler::LoadReg(int reg, X64Reg nativeReg) +{ + if (reg != 15) + MOV(32, R(nativeReg), MDisp(RCPU, offsetof(ARM, R[reg]))); + else + MOV(32, R(nativeReg), Imm32(R15)); +} + +void Compiler::UnloadReg(int reg, X64Reg nativeReg) +{ + MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -58,12 +69,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); + // TODO: this is ugly as a whole, do better + RegCache = ARMJIT::RegCache(this, instrs, instrsCount); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 
2 : 4;
 CurrentInstr = instrs[i];
- CompileFunc comp = NULL;
+ CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind);
+
+ if (CurrentInstr.Info.Branches())
+ comp = NULL;
 
 if (comp == NULL || i == instrsCount - 1)
 {
@@ -79,6 +96,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 SaveCPSR();
 }
 
+ if (comp != NULL)
+ RegCache.Prepare(i);
+ else
+ RegCache.Flush();
+
 if (Thumb)
 {
 if (comp == NULL)
@@ -89,8 +111,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]);
 }
 else
- {
- }
+ (this->*comp)();
 }
 else
 {
@@ -101,7 +122,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 ABI_CallFunction(ARMInterpreter::A_BLX_IMM);
 }
 else if (cond == 0xF)
- AddCycles_C();
+ Comp_AddCycles_C();
 else
 {
 FixupBranch skipExecute;
@@ -115,17 +136,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 MOV(32, R(RSCRATCH), Imm32(1));
 SHL(32, R(RSCRATCH), R(RSCRATCH3));
 TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond]));
-
+
 skipExecute = J_CC(CC_Z);
 }
 else
 {
 // could have used a LUT, but then where would be the fun?
 BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))));
-
+
 skipExecute = J_CC(cond & 1 ? CC_C : CC_NC);
 }
-
+
 }
 
 if (comp == NULL)
@@ -136,8 +157,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]);
 }
 else
- {
- }
+ (this->*comp)();
 
 FixupBranch skipFailed;
 if (CurrentInstr.Cond() < 0xE)
@@ -145,7 +165,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 skipFailed = J();
 SetJumpTarget(skipExecute);
 
- AddCycles_C();
+ Comp_AddCycles_C();
 
 SetJumpTarget(skipFailed);
 }
@@ -155,13 +175,14 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 
 /* we don't need to collect the interpreted cycles,
 since all functions only add to it, the dispatcher
- can take care of it.
+ takes care of it.
 */
 if (comp == NULL && i != instrsCount - 1)
 LoadCPSR();
 }
 
+ RegCache.Flush();
 SaveCPSR();
 
 LEA(32, RAX, MDisp(RCycles, ConstantCycles));
@@ -172,42 +193,57 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
 
 return res;
 }
 
-void Compiler::Compile(RegCache& regs, const FetchedInstr& instr)
+CompileFunc Compiler::GetCompFunc(int kind)
 {
+ // this might look like a waste of space with so many repetitions, but it's invaluable for debugging.
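+ // A sketch of how these tables are consumed, using the CompileFunc typedef
+ // from ARMJIT_Compiler.h: the decoded instruction kind indexes straight into
+ // the table, and a NULL entry means there is no compiler for that
+ // instruction yet, so the block falls back to calling the interpreter:
+ //
+ //     CompileFunc f = Thumb ? T_Comp[kind] : A_Comp[kind];
+ //     if (f != NULL)
+ //         (this->*f)(); // emit specialised x64 code for this instruction
+ //     else
+ //         { /* emit a call to the matching ARMInterpreter routine */ }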
+ // see ARMInstrInfo.h for the order const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = { - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // AND + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // EOR + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // SUB + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // RSB + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ADD + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ADC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // SBC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // RSC 
+ A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ORR + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // MOV + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + // BIC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // MVN + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + // TST + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // TEQ + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // CMP + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // CMN + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -227,21 +263,34 @@ void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // Shift imm + T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, + // Three operand ADD/SUB + T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, + // 8 bit imm + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + // general ALU + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + // hi reg + T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, + // pc/sp relative + NULL, NULL, NULL, + // mem... + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + + return Thumb ? T_Comp[kind] : A_Comp[kind]; } -void Compiler::AddCycles_C() +void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 
1 : 3] @@ -253,80 +302,16 @@ void Compiler::AddCycles_C() ConstantCycles += cycles; } -// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue -OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed) -{ - carryUsed = true; - - switch (op) - { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return R(rm); - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), R(rm)); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR - MOV(32, R(RSCRATCH), R(rm)); - SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); - if (S) - { - if (amount == 0) - { - MOV(32, R(RSCRATCH2), R(rm)); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - else - SETcc(CC_C, R(RSCRATCH2)); - } - return R(RSCRATCH); - case 3: // ROR - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - ROR_(32, R(RSCRATCH), Imm8(amount)); - } - else - { - BT(32, R(RCPSR), Imm8(29)); - MOV(32, R(RSCRATCH), R(rm)); - RCR(32, R(RSCRATCH), Imm8(1)); - } - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } -} - -void Compiler::A_Comp_ALU(const FetchedInstr& instr) +void Compiler::Comp_AddCycles_CI(u32 i) { + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; + + if (CurrentInstr.Cond() < 0xE) + ADD(32, R(RCycles), Imm8(cycles)); + else + ConstantCycles += cycles; } } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 8e1d100..9b454f4 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,7 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" - +#include "../ARMJIT_RegCache.h" namespace ARMJIT { @@ -17,6 +17,10 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +class Compiler; + +typedef void (Compiler::*CompileFunc)(); + class Compiler : public Gen::X64CodeBlock { public: @@ -24,24 +28,66 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); - void StartBlock(ARM* cpu); - CompiledBlock FinaliseBlock(); + void LoadReg(int reg, Gen::X64Reg nativeReg); + void UnloadReg(int reg, Gen::X64Reg nativeReg); - void Compile(RegCache& regs, const FetchedInstr& instr); private: - void AddCycles_C(); + CompileFunc GetCompFunc(int kind); + + void Comp_AddCycles_C(); + void Comp_AddCycles_CI(u32 i); + + enum + { + opSetsFlags = 1 << 0, + opSymmetric = 1 << 1, + opRetriveCV = 1 << 2, + opInvertCarry = 1 << 3, + opSyncCarry = 1 << 4, + opInvertOp2 = 1 << 5, + }; + + void A_Comp_Arith(); + void A_Comp_MovOp(); + void A_Comp_CmpOp(); - Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed); + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALU_Imm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); - void A_Comp_ALU(const FetchedInstr& instr); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, 
Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); + Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); + + Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); void LoadCPSR(); void SaveCPSR(); + Gen::OpArg MapReg(int reg) + { + if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) + return Gen::Imm32(R15); + + assert(RegCache.Mapping[reg] != Gen::INVALID_REG); + return Gen::R(RegCache.Mapping[reg]); + } + bool CPSRDirty = false; FetchedInstr CurrentInstr; + RegCache RegCache; + bool Thumb; u32 Num; u32 R15; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a6011e1..0faa57a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,6 +52,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp -- cgit v1.2.3 From ea98a44e1e92b0f7622b28d36a1ba6c8d4679a1f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 18:28:01 +0200 Subject: jit: correct cycle counting for thumb shift by reg --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 7 +++++-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 0 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_LoadStore.cpp (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index d06c99c..dc82af7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -456,7 +456,10 @@ void Compiler::T_Comp_ALU() u32 op = (CurrentInstr.Instr >> 6) & 0xF; - Comp_AddCycles_C(); + if ((op >= 0x2 && op < 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); switch (op) { @@ -471,7 +474,7 @@ void Compiler::T_Comp_ALU() case 0x4: case 0x7: { - int shiftOp = op == 7 ? 3 : op - 0x2; + int shiftOp = op == 0x7 ? 
3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); TEST(32, shifted, shifted); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3 From 550e6b86d2dc09960c5a74270bc49d3f0e895699 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 30 Jun 2019 13:35:03 +0200 Subject: JIT: compilation of word load and store --- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 3 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 111 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 600 ++++++++++++++++++++++++++++++++++++ src/ARM_InstrInfo.h | 8 +- src/CMakeLists.txt | 1 + src/dolphin/x64ABI.h | 3 +- 10 files changed, 712 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74e154b..4da781c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -40,8 +40,7 @@ static ptrdiff_t JIT_MEM[2][32] = { /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), /* 3X*/ offsetof(BlockCache, SWRAM), offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ -1, - offsetof(BlockCache, ARM7_WIRAM), + /* 4X*/ DUP2(-1), /* 5X*/ DUP2(-1), /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ @@ -183,7 +182,6 @@ void ResetBlocks() memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2ca29e8..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -63,14 +63,13 @@ struct BlockCache { CompiledBlock* AddrMapping[2][0x4000] = {0}; - CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM CompiledBlock ARM9_ITCM[0x8000/2]; CompiledBlock ARM9_LCDC[0xA4000/2]; CompiledBlock ARM9_BIOS[0x8000/2]; CompiledBlock ARM7_BIOS[0x4000/2]; CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM }; diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index e18d50f..ea9fb30 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -30,7 +30,7 @@ public: assert(Mapping[reg] != -1); if (DirtyRegs & (1 << reg)) - Compiler->UnloadReg(reg, Mapping[reg]); + Compiler->SaveReg(reg, Mapping[reg]); DirtyRegs &= ~(1 << reg); LoadedRegs &= ~(1 << reg); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index dc82af7..6294e1d 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -255,8 +255,8 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b if (S) { XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - BT(32, R(RCPSR), Imm8(29)); - SETcc(CC_C, R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); } MOV(32, R(RSCRATCH), rm); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index f51d4d9..9096397 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,13 +9,43 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg 
RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +const X64Reg RegCache::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13 +#else + RBX, R12, R13 +#endif +}; template <> -const int RegCache::NativeRegsAvailable = 5; +const int RegCache::NativeRegsAvailable = +#ifdef _WIN32 + 5 +#else + 3 +#endif +; Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 4); + AllocCodeSpace(1024 * 1024 * 16); + + for (int i = 0; i < 15; i++) + { + ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); + WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); + for (int j = 0; j < 2; j++) + { + ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); + WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + } + } + ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); + WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); + ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); + WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); + + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -42,7 +72,7 @@ void Compiler::LoadReg(int reg, X64Reg nativeReg) MOV(32, R(nativeReg), Imm32(R15)); } -void Compiler::UnloadReg(int reg, X64Reg nativeReg) +void Compiler::SaveReg(int reg, X64Reg nativeReg) { MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } @@ -52,7 +82,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) { ResetBlocks(); - ResetCodePtr(); + SetCodePtr((u8*)ResetStart); } CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -61,8 +91,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Thumb = cpu->CPSR & 0x20; Num = cpu->Num; R15 = cpu->R[15]; + CodeRegion = cpu->CodeRegion; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); XOR(32, R(RCycles), R(RCycles)); @@ -142,9 +173,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else { // could have used a LUT, but then where would be the fun? - BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); + skipExecute = J_CC(cond & 1 ? 
CC_NZ : CC_Z); } } @@ -187,7 +218,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LEA(32, RAX, MDisp(RCycles, ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); return res; @@ -243,23 +274,38 @@ CompileFunc Compiler::GetCompFunc(int kind) A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // Mul + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + NULL, NULL, NULL, NULL, NULL, + // STR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // STRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // LDRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // STRH + NULL, NULL, NULL, NULL, + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + NULL, NULL, NULL, NULL, + // LDRSB NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRSH + NULL, NULL, NULL, NULL, + // swap + NULL, NULL, + // LDM/STM + NULL, NULL, + // Branch + NULL, NULL, NULL, NULL, NULL, + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { @@ -278,10 +324,17 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative NULL, NULL, NULL, - // mem... - NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // LDR pcrel + NULL, + // LDR/STR reg offset + T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, + // LDR/STR sign extended, half + NULL, NULL, NULL, NULL, + // LDR/STR imm offset + T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + // LDR/STR half imm offset + NULL, NULL, + // branch, etc. 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9b454f4..7ab9b25 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,7 +29,7 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); - void UnloadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); private: CompileFunc GetCompFunc(int kind); @@ -51,12 +51,17 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MemWB(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), @@ -65,10 +70,14 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void* Gen_MemoryRoutine9(bool store, int size, u32 region); + void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); void SaveCPSR(); @@ -82,6 +91,8 @@ private: return Gen::R(RegCache.Mapping[reg]); } + void* ResetStart; + bool CPSRDirty = false; FetchedInstr CurrentInstr; @@ -91,10 +102,16 @@ private: bool Thumb; u32 Num; u32 R15; + u32 CodeRegion; u32 ConstantCycles; }; +extern void* ReadMemFuncs9[16]; +extern void* ReadMemFuncs7[2][16]; +extern void* WriteMemFuncs9[16]; +extern void* WriteMemFuncs7[2][16]; + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index e69de29..d534269 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,600 @@ +#include "ARMJIT_Compiler.h" + +#include "../GPU.h" +#include "../Wifi.h" + +namespace NDS +{ +#define MAIN_RAM_SIZE 0x400000 +extern u8* SWRAM_ARM9; +extern u32 SWRAM_ARM9Mask; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM7Mask; +extern u8 ARM7WRAM[]; +extern u16 ARM7BIOSProt; +} + +using namespace Gen; + +namespace ARMJIT +{ + +void* ReadMemFuncs9[16]; +void* ReadMemFuncs7[2][16]; +void* WriteMemFuncs9[16]; +void* WriteMemFuncs7[2][16]; + +template +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +u32 ReadVRAM9(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void WriteVRAM9(u32 addr, u32 val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} + +/* + R11 - data 
to write (store only) + RSCRATCH2 - address + RSCRATCH3 - code cycles +*/ +void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +{ + AlignCode4(); + void* res = (void*)GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // cycle counting! + // this is AddCycles_CDI + MOV(32, R(R10), R(RSCRATCH2)); + SHR(32, R(R10), Imm8(12)); + MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); + CMP(32, R(R10), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(R10), CC_G); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); + ADD(32, R(RCycles), R(RSCRATCH3)); + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + { + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch outsideDTCM = J_CC(CC_AE); + AND(32, R(RSCRATCH2), Imm32(0x3FFF)); + if (!store) + { + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); + RET(); + SetJumpTarget(outsideDTCM); + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + } + + switch (region) + { + case 0x00000000: + case 0x01000000: + { + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + RET(); + SetJumpTarget(insideITCM); + AND(32, R(RSCRATCH2), Imm32(0x7FFF)); + if (!store) + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); + else + { + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); + } + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); + TEST(64, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); + if (!store) + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); + else + { + MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + SetJumpTarget(notMapped); + } + break; + case 0x04000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9IORead32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9IOWrite32, true); + } + break; + case 0x05000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + 
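+ // NOTE: this sequence appears to select, without branching, which
+ // NDS::PowerControl9 bit gates the access: bit 1 (2D engine A) or bit 9
+ // (2D engine B), keyed on address bit 10, which separates the two palette
+ // halves. In plain C, roughly:
+ //
+ //     u32 bit = (addr & 0x400) ? (1 << 9) : (1 << 1);
+ //     if (!(NDS::PowerControl9 & bit))
+ //         return; // engine powered down, the access is dropped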
CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(ReadVRAM9); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)WriteVRAM9, true); + } + break; + case 0x07000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + case 0xFF000000: + if (!store) + { + AND(32, R(RSCRATCH2), Imm32(0xFFF)); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); + } + break; + default: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9Write32, true); + } + break; + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +{ + AlignCode4(); + void* res = GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // AddCycles_CDI + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); + if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + { + if (!store && region != 0x02000000) + LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); + ADD(32, R(RCycles), R(RSCRATCH3)); + } + else + { + if (!store) + ADD(32, R(region == 0x02000000 ? 
RSCRATCH2 : RSCRATCH), Imm8(1)); + LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); + CMP(32, R(RSCRATCH3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); + CMP(32, R(R10), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + switch (region) + { + case 0x00000000: + if (!store) { + CMP(32, R(RSCRATCH2), Imm32(0x4000)); + FixupBranch outsideBIOS1 = J_CC(CC_AE); + + MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); + CMP(32, R(RSCRATCH), Imm32(0x4000)); + FixupBranch outsideBIOS2 = J_CC(CC_AE); + MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); + CMP(32, R(RSCRATCH2), R(RSCRATCH3)); + FixupBranch notDenied1 = J_CC(CC_AE); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + FixupBranch notDenied2 = J_CC(CC_B); + SetJumpTarget(outsideBIOS2); + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + RET(); + + SetJumpTarget(notDenied1); + SetJumpTarget(notDenied2); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + + SetJumpTarget(outsideBIOS1); + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); + if (!store) + { + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + { + MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + RET(); + SetJumpTarget(region); + SetJumpTarget(notMapped); + AND(32, R(RSCRATCH2), Imm32(0xFFFF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); + } + } + break; + case 0x04000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(NDS::ARM7IORead32); + ABI_PopRegistersAndAdjustStack({}, 8); + + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM7IOWrite32, true); + } + SetJumpTarget(region); + + if (!store) + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); + + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({EAX}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + MOV(32, R(RSCRATCH2), 
R(EAX)); + SHL(32, R(RSCRATCH2), Imm8(16)); + ABI_PopRegistersAndAdjustStack({EAX}, 8); + OR(32, R(EAX), R(RSCRATCH2)); + } + else + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + SHR(32, R(R11), Imm8(16)); + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + } + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(GPU::ReadVRAM_ARM7); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)GPU::WriteVRAM_ARM7, true); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + /*default: + ABI_PushRegistersAndAdjustStack({}, 8, 0); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + break;*/ + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +OpArg Compiler::A_Comp_GetMemWBOffset() +{ + if (!(CurrentInstr.Instr & (1 << 25))) + return Imm32(CurrentInstr.Instr & 0xFFF); + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + int amount = (CurrentInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); + } +} + +void Compiler::A_Comp_MemWB() +{ + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + bool load = CurrentInstr.Instr & (1 << 20); + + MOV(32, R(RSCRATCH2), rn); + if (CurrentInstr.Instr & (1 << 24)) + { + OpArg offset = A_Comp_GetMemWBOffset(); + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, R(RSCRATCH2), offset); + else + SUB(32, R(RSCRATCH2), offset); + + if (CurrentInstr.Instr & (1 << 21)) + MOV(32, rn, R(RSCRATCH2)); + } + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? 
WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9;
+ MOV(32, R(R11), rd);
+ }
+ CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray)));
+
+ if (load)
+ MOV(32, R(RSCRATCH2), R(RSCRATCH));
+
+ if (!(CurrentInstr.Instr & (1 << 24)))
+ {
+ OpArg offset = A_Comp_GetMemWBOffset();
+
+ if (CurrentInstr.Instr & (1 << 23))
+ ADD(32, rn, offset);
+ else
+ SUB(32, rn, offset);
+ }
+
+ if (load)
+ MOV(32, rd, R(RSCRATCH2));
+}
+
+void Compiler::T_Comp_MemReg()
+{
+ OpArg rd = MapReg(CurrentInstr.T_Reg(0));
+ OpArg rb = MapReg(CurrentInstr.T_Reg(3));
+ OpArg ro = MapReg(CurrentInstr.T_Reg(6));
+
+ int op = (CurrentInstr.Instr >> 10) & 0x3;
+ bool load = op & 0x2;
+
+ MOV(32, R(RSCRATCH2), rb);
+ ADD(32, R(RSCRATCH2), ro);
+
+ u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles);
+ MOV(32, R(RSCRATCH3), Imm32(cycles));
+ MOV(32, R(RSCRATCH), R(RSCRATCH2));
+ SHR(32, R(RSCRATCH), Imm8(24));
+ AND(32, R(RSCRATCH), Imm8(0xF));
+ void** funcArray;
+ if (load)
+ funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9;
+ else
+ {
+ funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9;
+ MOV(32, R(R11), rd);
+ }
+ CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray)));
+
+ if (load)
+ MOV(32, rd, R(RSCRATCH));
+}
+
+void Compiler::T_Comp_MemImm()
+{
+ // TODO: clean this up!!!
+ OpArg rd = MapReg(CurrentInstr.T_Reg(0));
+ OpArg rb = MapReg(CurrentInstr.T_Reg(3));
+
+ int op = (CurrentInstr.Instr >> 11) & 0x3;
+ u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4;
+ bool load = op & 0x1;
+
+ LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset));
+ u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles);
+ MOV(32, R(RSCRATCH3), Imm32(cycles));
+ MOV(32, R(RSCRATCH), R(RSCRATCH2));
+ SHR(32, R(RSCRATCH), Imm8(24));
+ AND(32, R(RSCRATCH), Imm8(0xF));
+ void** funcArray;
+ if (load)
+ funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9;
+ else
+ {
+ funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9;
+ MOV(32, R(R11), rd);
+ }
+ CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray)));
+
+ if (load)
+ MOV(32, rd, R(RSCRATCH));
+}
+
+} \ No newline at end of file
diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h
index e717664..dcd938b 100644
--- a/src/ARM_InstrInfo.h
+++ b/src/ARM_InstrInfo.h
@@ -83,10 +83,10 @@ enum
 ak_ALU(BIC),
 ak_ALU(MVN),
 
- ak_ALU(TST),
- ak_ALU(TEQ),
- ak_ALU(CMP),
- ak_ALU(CMN),
+ ak_Test(TST),
+ ak_Test(TEQ),
+ ak_Test(CMP),
+ ak_Test(CMN),
 
 ak_MUL,
 ak_MLA,
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0faa57a..ae04ffb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -53,6 +53,7 @@ add_library(core STATIC
 ARMJIT.cpp
 ARMJIT_x64/ARMJIT_Compiler.cpp
 ARMJIT_x64/ARMJIT_ALU.cpp
+ ARMJIT_x64/ARMJIT_LoadStore.cpp
 
 dolphin/CommonFuncs.cpp
 dolphin/x64ABI.cpp
diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h
index 997782e..94336d0 100644
--- a/src/dolphin/x64ABI.h
+++ b/src/dolphin/x64ABI.h
@@ -37,7 +37,8 @@
 // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. 
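+// NOTE: a small sketch of the register-mask encoding assumed here, using
+// Dolphin's BitSet32 from BitSet.h: GPRs occupy bits 0-15 and XMM registers
+// are offset by +16, so a single 32-bit mask can name both register files:
+//
+//     BitSet32 mask{RAX, RDX, XMM0 + 16, XMM5 + 16};
+//     // bit 0 = RAX, bit 2 = RDX, bit 16 = XMM0, bit 21 = XMM5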
#define ABI_ALL_CALLER_SAVED \ - (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11}) + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) #else // 64-bit Unix / OS X #define ABI_PARAM1 RDI -- cgit v1.2.3 From 10e386fe50af1a11ada54a380f6802025fca8efd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 6 Jul 2019 01:48:42 +0200 Subject: JIT: most mem instructions working + branching --- src/ARM.cpp | 10 +- src/ARMJIT.cpp | 7 +- src/ARMJIT.h | 2 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 322 ++++++++------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 145 ++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 42 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 805 +++++++++++++++--------------------- src/ARM_InstrInfo.cpp | 2 +- src/NDS.cpp | 2 + 10 files changed, 653 insertions(+), 686 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index eadedc7..df58ce3 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -565,8 +565,9 @@ void ARMv5::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) @@ -650,8 +651,9 @@ void ARMv4::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 4da781c..6afa967 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,12 +121,13 @@ void DeInit() delete compiler; } -CompiledBlock CompileBlock(ARM* cpu) +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; FetchedInstr instrs[12]; int i = 0; + u32 r15Initial = cpu->R[15]; u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; //printf("block %x %d\n", r15, thumb); @@ -169,9 +170,7 @@ CompiledBlock CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block); - - return block; + InsertBlock(cpu->Num, r15Initial - (thumb ? 
2 : 4), block); } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..71188f9 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index ea9fb30..556d27b 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -114,7 +114,7 @@ public: for (int reg : needToBeLoaded) LoadRegister(reg); } - DirtyRegs |= Instr.Info.DstRegs; + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6294e1d..c22751e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -71,30 +71,30 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) { switch (op) { - case 0: // TST - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - TEST(32, rn, op2); - break; - case 1: // TEQ + case 0: // TST + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - XOR(32, R(RSCRATCH3), op2); - break; - case 2: // CMP - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - CMP(32, rn, op2); - break; - case 3: // CMN + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - ADD(32, R(RSCRATCH3), op2); - break; + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; } Comp_RetriveFlags(op == 2, op >= 2, carryUsed); @@ -103,38 +103,38 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) // also calculates cycles OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) { - if (CurrentInstr.Instr & (1 << 25)) + if (CurInstr.Instr & (1 << 25)) { Comp_AddCycles_C(); carryUsed = false; - return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + return Imm32(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - if (CurrentInstr.Instr & (1 << 4)) + int op = (CurInstr.Instr >> 5) & 0x3; + if (CurInstr.Instr & (1 << 4)) { Comp_AddCycles_CI(1); - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); - if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + OpArg rm = MapReg(CurInstr.A_Reg(0)); + if (rm.IsImm() && CurInstr.A_Reg(0) == 15) rm = Imm32(rm.Imm32() + 4); - return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + return Comp_RegShiftReg(op, MapReg(CurInstr.A_Reg(8)), rm, S, carryUsed); } else { Comp_AddCycles_C(); - return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, - MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + return Comp_RegShiftImm(op, (CurInstr.Instr >> 7) & 0x1F, + MapReg(CurInstr.A_Reg(0)), S, carryUsed); } } } void Compiler::A_Comp_CmpOp() { - u32 op = (CurrentInstr.Instr >> 21) & 0xF; + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); Comp_CmpOp(op - 0x8, rn, op2, carryUsed); @@ -142,12 +142,12 @@ void Compiler::A_Comp_CmpOp() void Compiler::A_Comp_Arith() { - bool S = CurrentInstr.Instr & (1 << 20); - u32 op = (CurrentInstr.Instr >> 21) 
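/* A note on the (1 << op) & 0xF303 test handed to A_Comp_GetALUOp2 in these
   handlers: 0xF303 has bits 0, 1, 8, 9, 12, 13, 14, 15 set, which appears
   to select exactly the logical data-processing ops,

       AND, EOR, TST, TEQ, ORR, MOV, BIC, MVN

   the ops whose S variants take the C flag from the barrel shifter rather
   than from an arithmetic carry-out. */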
& 0xF; + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); u32 sFlag = S ? opSetsFlags : 0; @@ -155,13 +155,13 @@ void Compiler::A_Comp_Arith() { case 0x0: // AND Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x1: // EOR Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x2: // SUB Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x3: // RSB if (op2.IsZero()) { @@ -173,41 +173,44 @@ void Compiler::A_Comp_Arith() } else Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x4: // ADD Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); - return; + break; case 0x5: // ADC Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); - return; + break; case 0x6: // SBC Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); - return; + break; case 0x7: // RSC Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); - return; + break; case 0xC: // ORR Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0xE: // BIC Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); - return; + break; default: assert("unimplemented"); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::A_Comp_MovOp() { bool carryUsed; - bool S = CurrentInstr.Instr & (1 << 20); + bool S = CurInstr.Instr & (1 << 20); OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); if (rd != op2) MOV(32, rd, op2); - if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + if (((CurInstr.Instr >> 21) & 0xF) == 0xF) NOT(32, rd); if (S) @@ -215,6 +218,9 @@ void Compiler::A_Comp_MovOp() TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) @@ -230,7 +236,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) } if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurrentInstr.Instr); + printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); @@ -324,61 +330,61 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car switch (op) { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return rm; - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), rm); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR + case 0: // LSL + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - SAR(32, R(RSCRATCH), Imm8(amount ? 
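/* Context for the Imm8(amount ? amount : 31) idiom here: in the ARM
   encoding an immediate shift amount of 0 means ASR #32, i.e. every result
   bit becomes a copy of bit 31. x86 caps 32-bit shift counts at 31, but

       SAR(32, R(RSCRATCH), Imm8(31));  // sign-fill, same value as ASR #32

   produces the identical result; the carry for the amount == 0 case is then
   recovered separately with BT on bit 31. */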
amount : 31)); + SHL(32, R(RSCRATCH), Imm8(amount)); if (S) - { - if (amount == 0) - BT(32, rm, Imm8(31)); SETcc(CC_C, R(RSCRATCH2)); - } + return R(RSCRATCH); - case 3: // ROR + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - if (amount > 0) - ROR_(32, R(RSCRATCH), Imm8(amount)); - else - { - BT(32, R(RCPSR), Imm8(29)); - RCR(32, R(RSCRATCH), Imm8(1)); - } + SHR(32, R(RSCRATCH), Imm8(amount)); if (S) SETcc(CC_C, R(RSCRATCH2)); return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); } assert(false); @@ -386,11 +392,11 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car void Compiler::T_Comp_ShiftImm() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 11) & 0x3; - int amount = (CurrentInstr.Instr >> 6) & 0x1F; + int op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; Comp_AddCycles_C(); @@ -406,12 +412,12 @@ void Compiler::T_Comp_ShiftImm() void Compiler::T_Comp_AddSub_() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 9) & 0x3; + int op = (CurInstr.Instr >> 9) & 0x3; - OpArg rn = op >= 2 ? Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + OpArg rn = op >= 2 ? 
Imm32((CurInstr.Instr >> 6) & 0x7) : MapReg(CurInstr.T_Reg(6)); Comp_AddCycles_C(); @@ -423,38 +429,38 @@ void Compiler::T_Comp_AddSub_() void Compiler::T_Comp_ALU_Imm8() { - OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + OpArg rd = MapReg(CurInstr.T_Reg(8)); - u32 op = (CurrentInstr.Instr >> 11) & 0x3; - OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + u32 op = (CurInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurInstr.Instr & 0xFF); Comp_AddCycles_C(); switch (op) { - case 0x0: - MOV(32, rd, imm); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; - case 0x1: - Comp_CmpOp(2, rd, imm, false); - return; - case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); - return; - case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); - return; + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; } } void Compiler::T_Comp_ALU() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - u32 op = (CurrentInstr.Instr >> 6) & 0xF; + u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) Comp_AddCycles_CI(1); @@ -522,28 +528,62 @@ void Compiler::T_Comp_ALU() void Compiler::T_Comp_ALU_HiReg() { - OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); - OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + OpArg rdMapped = MapReg(rd); + OpArg rs = MapReg((CurInstr.Instr >> 3) & 0xF); - u32 op = (CurrentInstr.Instr >> 8) & 0x3; + u32 op = (CurInstr.Instr >> 8) & 0x3; Comp_AddCycles_C(); switch (op) { - case 0x0: // ADD - Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); - return; - case 0x1: // CMP - Comp_CmpOp(2, rd, rs, false); - return; - case 0x2: // MOV - if (rd != rs) - MOV(32, rd, rs); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; + case 0x0: // ADD + Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + break; + case 0x1: // CMP + Comp_CmpOp(2, rdMapped, rs, false); + return; // this is on purpose + case 0x2: // MOV + if (rdMapped != rs) + MOV(32, rdMapped, rs); + TEST(32, rdMapped, rdMapped); + Comp_RetriveFlags(false, false, false); + break; + } + + if (rd == 15) + { + OR(32, rdMapped, Imm8(1)); + Comp_JumpTo(rdMapped.GetSimpleReg()); } } +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + OpArg sp = MapReg(13); + OpArg offset = Imm32((CurInstr.Instr & 0x7F) << 2); + if (CurInstr.Instr & (1 << 7)) + SUB(32, sp, offset); + else + ADD(32, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + OpArg sp = MapReg(13); + LEA(32, rd.GetSimpleReg(), MDisp(sp.GetSimpleReg(), offset)); + } + else + MOV(32, rd, Imm32((R15 & ~2) + offset)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 9096397..b7358a2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ 
b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,7 +9,7 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = +const X64Reg RegCache::NativeRegAllocOrder[] = { #ifdef _WIN32 RBX, RSI, RDI, R12, R13 @@ -18,7 +18,7 @@ const X64Reg RegCache::NativeRegAllocOrder[] = #endif }; template <> -const int RegCache::NativeRegsAvailable = +const int RegCache::NativeRegsAvailable = #ifdef _WIN32 5 #else @@ -30,24 +30,33 @@ Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); - for (int i = 0; i < 15; i++) + for (int i = 0; i < 3; i++) { - ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); - WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); for (int j = 0; j < 2; j++) { - ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); - WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); + MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); + MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } - ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); - WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); - ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); - WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); ResetStart = GetWritableCodePtr(); } +DataRegion Compiler::ClassifyAddress(u32 addr) +{ + if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) + return dataRegionDTCM; + switch (addr & 0xFF000000) + { + case 0x02000000: return dataRegionMainRAM; + case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? dataRegionWRAM7 : dataRegionSWRAM; + case 0x04000000: return dataRegionIO; + case 0x06000000: return dataRegionVRAM; + } + return dataRegionGeneric; +} + void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -92,6 +101,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Num = cpu->Num; R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; + CurCPU = cpu; ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); @@ -106,27 +116,32 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 
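/* R15 here models the fetch-stage PC of the emulated pipeline: it is bumped
   before instruction i is compiled, so while i executes, a read of the PC
   register observes

       instrAddr + (Thumb ? 4 : 8)  // two fetches ahead of instrAddr

   matching the interpreter's prefetch behaviour. */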
2 : 4; - CurrentInstr = instrs[i]; - - CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind); + CurInstr = instrs[i]; - if (CurrentInstr.Info.Branches()) - comp = NULL; + CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); if (i == instrsCount - 1) { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - SaveCPSR(); + if (comp == NULL || CurInstr.Info.Branches()) + SaveCPSR(); } + // run interpreter + cpu->CodeCycles = CurInstr.CodeCycles; + cpu->R[15] = R15; + cpu->CurInstr = CurInstr.Instr; + cpu->NextInstr[0] = CurInstr.NextInstr[0]; + cpu->NextInstr[1] = CurInstr.NextInstr[1]; + if (comp != NULL) RegCache.Prepare(i); else @@ -134,26 +149,33 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } else (this->*comp)(); + + ARMInterpreter::THUMBInstrTable[icode](cpu); } else { - u32 cond = CurrentInstr.Cond(); - if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { MOV(64, R(ABI_PARAM1), R(RCPU)); ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + + ARMInterpreter::A_BLX_IMM(cpu); } else if (cond == 0xF) + { Comp_AddCycles_C(); + cpu->AddCycles_C(); + } else { FixupBranch skipExecute; @@ -180,18 +202,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } + u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); } else (this->*comp)(); FixupBranch skipFailed; - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) { skipFailed = J(); SetJumpTarget(skipExecute); @@ -200,13 +222,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } + + if (cpu->CheckCondition(cond)) + ARMInterpreter::ARMInstrTable[icode](cpu); + else + cpu->AddCycles_C(); } } /* we don't need to collect the interpreted cycles, - since all functions only add to it, the dispatcher - takes care of it. + since cpu->Cycles is taken into account by the dispatcher. 
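   Concretely, a sketch of the accounting: the interpreter calls above add
   their cycles straight to cpu->Cycles while they run, so the block itself
   only returns what it counted natively,

       Cycles += block();  // interpreted cycles already sit in cpu->Cycles

   and nothing has to be read back out of the compiled code.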
*/ if (comp == NULL && i != instrsCount - 1) @@ -277,29 +303,29 @@ CompileFunc Compiler::GetCompFunc(int kind) // Mul NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRD NULL, NULL, NULL, NULL, // STRD NULL, NULL, NULL, NULL, // LDRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // swap - NULL, NULL, + NULL, NULL, // LDM/STM NULL, NULL, // Branch @@ -314,26 +340,26 @@ CompileFunc Compiler::GetCompFunc(int kind) // Three operand ADD/SUB T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative - NULL, NULL, NULL, + T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, // LDR pcrel - NULL, + NULL, // LDR/STR reg offset - T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, - // LDR/STR sign extended, half - NULL, NULL, NULL, NULL, + T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, + // LDR/STR sign extended, half + T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset - NULL, NULL, + T_Comp_MemImmHalf, T_Comp_MemImmHalf, // branch, etc. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -346,10 +372,10 @@ CompileFunc Compiler::GetCompFunc(int kind) void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles); + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; @@ -358,13 +384,26 @@ void Compiler::Comp_AddCycles_C() void Compiler::Comp_AddCycles_CI(u32 i) { s32 cycles = (Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 
0 : 2] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; - - if (CurrentInstr.Cond() < 0xE) + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; + + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; } +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + SaveCPSR(); + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 7ab9b25..9395a29 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,6 +6,8 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegCache.h" +#include + namespace ARMJIT { @@ -21,6 +23,19 @@ class Compiler; typedef void (Compiler::*CompileFunc)(); +enum DataRegion +{ + dataRegionGeneric, // hey, that's me! + dataRegionMainRAM, + dataRegionSWRAM, + dataRegionVRAM, + dataRegionIO, + dataRegionExclusive, + dataRegionsCount, + dataRegionDTCM = dataRegionExclusive, + dataRegionWRAM7 = dataRegionExclusive, +}; + class Compiler : public Gen::X64CodeBlock { public: @@ -34,6 +49,8 @@ public: private: CompileFunc GetCompFunc(int kind); + void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_AddCycles_C(); void Comp_AddCycles_CI(u32 i); @@ -47,11 +64,14 @@ private: opInvertOp2 = 1 << 5, }; + DataRegion ClassifyAddress(u32 addr); + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); void A_Comp_MemWB(); + void A_Comp_MemHalf(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -59,8 +79,15 @@ private: void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_RelAddr(); + void T_Comp_AddSP(); + void T_Comp_MemReg(); void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -70,8 +97,8 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void* Gen_MemoryRoutine9(bool store, int size, u32 region); - void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + void* Gen_MemoryRoutine9(bool store, int size); + void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -92,10 +119,12 @@ private: } void* ResetStart; + void* MemoryFuncs9[3][2]; + void* MemoryFuncs7[3][2][2]; bool CPSRDirty = false; - FetchedInstr CurrentInstr; + FetchedInstr CurInstr; RegCache RegCache; @@ -105,12 +134,9 @@ private: u32 CodeRegion; u32 ConstantCycles; -}; -extern void* ReadMemFuncs9[16]; -extern void* ReadMemFuncs7[2][16]; -extern void* WriteMemFuncs9[16]; -extern void* WriteMemFuncs7[2][16]; + ARM* CurCPU; +}; } diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index d534269..69746e2 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -5,7 +5,6 @@ namespace NDS { -#define MAIN_RAM_SIZE 0x400000 extern u8* SWRAM_ARM9; extern u32 SWRAM_ARM9Mask; extern u8* SWRAM_ARM7; @@ -19,11 +18,6 @@ using 
namespace Gen;

namespace ARMJIT
{
-void* ReadMemFuncs9[16];
-void* ReadMemFuncs7[2][16];
-void* WriteMemFuncs9[16];
-void* WriteMemFuncs7[2][16];
-
 template <typename T>
 int squeezePointer(T* ptr)
 {
@@ -32,569 +26,434 @@ int squeezePointer(T* ptr)
     return truncated;
 }

-u32 ReadVRAM9(u32 addr)
-{
-    switch (addr & 0x00E00000)
-    {
-    case 0x00000000: return GPU::ReadVRAM_ABG(addr);
-    case 0x00200000: return GPU::ReadVRAM_BBG(addr);
-    case 0x00400000: return GPU::ReadVRAM_AOBJ(addr);
-    case 0x00600000: return GPU::ReadVRAM_BOBJ(addr);
-    default: return GPU::ReadVRAM_LCDC(addr);
-    }
-}
+/*
+    According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number)
+    of all memory load and store instructions always access addresses in the same region as
+    during their first execution.

-void WriteVRAM9(u32 addr, u32 val)
-{
-    switch (addr & 0x00E00000)
-    {
-    case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return;
-    case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return;
-    case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return;
-    case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return;
-    default: GPU::WriteVRAM_LCDC(addr, val); return;
-    }
-}
+    I tried multiple optimisations, which would benefit from this behaviour
+    (having fast paths for the first region, …), though none of them yielded a measurable
+    improvement.
+*/

 /*
-    R11 - data to write (store only)
-    RSCRATCH2 - address
-    RSCRATCH3 - code cycles
+    address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows)
+    store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows)
+    code cycles - ABI_PARAM3
 */
-void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region)
+void* Compiler::Gen_MemoryRoutine9(bool store, int size)
 {
+    u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0));
     AlignCode4();
-    void* res = (void*)GetWritableCodePtr();
+    void* res = GetWritableCodePtr();

-    if (!store)
-    {
-        MOV(32, R(RSCRATCH), R(RSCRATCH2));
-        AND(32, R(RSCRATCH), Imm8(0x3));
-        SHL(32, R(RSCRATCH), Imm8(3));
-        // enter the shadow realm!
-        MOV(32, MDisp(RSP, 8), R(RSCRATCH));
-    }
+    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
+    SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase)));
+    CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize)));
+    FixupBranch insideDTCM = J_CC(CC_B);

-    // cycle counting!
-    // this is AddCycles_CDI
-    MOV(32, R(R10), R(RSCRATCH2));
-    SHR(32, R(R10), Imm8(12));
-    MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2));
-    LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6));
-    CMP(32, R(R10), R(RSCRATCH3));
-    CMOVcc(32, RSCRATCH3, R(R10), CC_G);
-    CMP(32, R(RSCRATCH), R(RSCRATCH3));
-    CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G);
-    ADD(32, R(RCycles), R(RSCRATCH3));
-
-    if (!store)
-        XOR(32, R(RSCRATCH), R(RSCRATCH));
-    AND(32, R(RSCRATCH2), Imm32(~3));
+    CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize)));
+    FixupBranch insideITCM = J_CC(CC_B);

+    // cycle counting!
+    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
+    SHR(32, R(RSCRATCH), Imm8(12));
+    MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ?
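/* A sketch of what the CMP/CMOVcc sequence just below computes, seemingly
   the ARM9 code/data interleaving rule of AddCycles_CDI:

       cycles = max(codeCycles, dataCycles, codeCycles + dataCycles - 6);

   where dataCycles is the per-4KB-page timing byte loaded here and
   codeCycles arrives in ABI_PARAM3 (hypothetical variable names). */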
2 : 0))); + LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + CMP(32, R(ABI_PARAM4), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + + if (store) { - MOV(32, R(RSCRATCH3), R(RSCRATCH2)); - SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch outsideDTCM = J_CC(CC_AE); - AND(32, R(RSCRATCH2), Imm32(0x3FFF)); - if (!store) + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) { - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + case 32: JMP((u8*)NDS::ARM9Write32, true); break; + case 16: JMP((u8*)NDS::ARM9Write16, true); break; + case 8: JMP((u8*)NDS::ARM9Write8, true); break; } - else - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); - RET(); - SetJumpTarget(outsideDTCM); - MOV(32, R(RSCRATCH2), R(RSCRATCH3)); } - - switch (region) + else { - case 0x00000000: - case 0x01000000: - { - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - RET(); - SetJumpTarget(insideITCM); - AND(32, R(RSCRATCH2), Imm32(0x7FFF)); - if (!store) - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); - else - { - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); - } - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); - TEST(64, R(RSCRATCH3), R(RSCRATCH3)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); - if (!store) - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); - else - { - MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - SetJumpTarget(notMapped); - } - break; - case 0x04000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9IORead32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9IOWrite32, true); - } - break; - case 0x05000000: - { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); - RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); - } - break; - case 0x06000000: - MOV(32, 
R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(ReadVRAM9); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)WriteVRAM9, true); - } - break; - case 0x07000000: + if (size == 32) { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + // everything's already in the appropriate register + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - case 0xFF000000: - if (!store) - { - AND(32, R(RSCRATCH2), Imm32(0xFFF)); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); - } - break; - default: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) + else if (size == 16) { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM9Read16, true); } else + JMP((u8*)NDS::ARM9Read8, true); + } + + SetJumpTarget(insideDTCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + if (store) + MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + if (size == 32) { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9Write32, true); + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); } - break; } + RET(); - if (!store) + SetJumpTarget(insideITCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX + AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); + if (store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } } - RET(); + static_assert(RSCRATCH == EAX); + return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
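/* Same masking convention as the ARM9 routine above: addressMask clears the
   low two address bits for words and the low bit for halfwords, and a
   misaligned 32-bit load is rotated back into place afterwards, roughly

       val = ROR(Read32(addr & ~3), (addr & 3) * 8);

   which is what the AND/SHL/ROR_ sequence on ECX implements on the 32-bit
   load paths. */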
1 : 0)); AlignCode4(); void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } - - // AddCycles_CDI - MOV(32, R(RSCRATCH), R(RSCRATCH2)); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); - if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + if (codeMainRAM) { - if (!store && region != 0x02000000) - LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); - ADD(32, R(RCycles), R(RSCRATCH3)); + LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); + ADD(32, R(RCycles), R(RSCRATCH)); } else { if (!store) - ADD(32, R(region == 0x02000000 ? RSCRATCH2 : RSCRATCH), Imm8(1)); - LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); - CMP(32, R(RSCRATCH3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); - CMP(32, R(R10), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(ABI_PARAM3), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); ADD(32, R(RCycles), R(RSCRATCH)); } - - if (!store) + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); + AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); + if (store) + { + MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + } + RET(); - switch (region) + SetJumpTarget(outsideMainRAM); + if (codeMainRAM) + { + if (!store) + ADD(32, R(ABI_PARAM4), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + else + { + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 
0 : 1)); + ADD(32, R(RCycles), R(RSCRATCH)); + } + if (store) + { + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) + { + case 32: JMP((u8*)NDS::ARM7Write32, true); break; + case 16: JMP((u8*)NDS::ARM7Write16, true); break; + case 8: JMP((u8*)NDS::ARM7Write8, true); break; + } + } + else { - case 0x00000000: - if (!store) { - CMP(32, R(RSCRATCH2), Imm32(0x4000)); - FixupBranch outsideBIOS1 = J_CC(CC_AE); - - MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); - CMP(32, R(RSCRATCH), Imm32(0x4000)); - FixupBranch outsideBIOS2 = J_CC(CC_AE); - MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); - CMP(32, R(RSCRATCH2), R(RSCRATCH3)); - FixupBranch notDenied1 = J_CC(CC_AE); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - FixupBranch notDenied2 = J_CC(CC_B); - SetJumpTarget(outsideBIOS2); - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - RET(); - - SetJumpTarget(notDenied1); - SetJumpTarget(notDenied2); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - - SetJumpTarget(outsideBIOS1); - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); - if (!store) - { - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - else - { - MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - RET(); - SetJumpTarget(region); - SetJumpTarget(notMapped); - AND(32, R(RSCRATCH2), Imm32(0xFFFF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); - } - } - break; - case 0x04000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(NDS::ARM7IORead32); - ABI_PopRegistersAndAdjustStack({}, 8); - - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM7IOWrite32, true); - } - SetJumpTarget(region); - - if (!store) - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); - - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({EAX}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - MOV(32, R(RSCRATCH2), R(EAX)); - SHL(32, R(RSCRATCH2), Imm8(16)); - 
ABI_PopRegistersAndAdjustStack({EAX}, 8); - OR(32, R(EAX), R(RSCRATCH2)); - } - else - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - SHR(32, R(R11), Imm8(16)); - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - } - } - break; - case 0x06000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(GPU::ReadVRAM_ARM7); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)GPU::WriteVRAM_ARM7, true); - } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - /*default: - ABI_PushRegistersAndAdjustStack({}, 8, 0); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (size == 32) + { + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - break;*/ + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else if (size == 16) + { + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM7Read16, true); + } + else + JMP((u8*)NDS::ARM7Read8, true); } + return res; +} + +void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +{ + if (store) + MOV(32, R(ABI_PARAM2), rd); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM3), Imm32(cycles)); + CALL(Num == 0 + ? MemoryFuncs9[size >> 4][store] + : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + if (!store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + if (signExtend) + MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); } - - RET(); - - return res; } OpArg Compiler::A_Comp_GetMemWBOffset() { - if (!(CurrentInstr.Instr & (1 << 25))) - return Imm32(CurrentInstr.Instr & 0xFFF); + if (!(CurInstr.Instr & (1 << 25))) + { + u32 imm = CurInstr.Instr & 0xFFF; + return Imm32(imm); + } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - int amount = (CurrentInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurInstr.A_Reg(0)); bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); } } void Compiler::A_Comp_MemWB() -{ - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); - bool load = CurrentInstr.Instr & (1 << 20); +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + int size = byte ? 
8 : 32; - MOV(32, R(RSCRATCH2), rn); - if (CurrentInstr.Instr & (1 << 24)) + if (CurInstr.Instr & (1 << 24)) { OpArg offset = A_Comp_GetMemWBOffset(); - if (CurrentInstr.Instr & (1 << 23)) - ADD(32, R(RSCRATCH2), offset); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); else - SUB(32, R(RSCRATCH2), offset); + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } - if (CurrentInstr.Instr & (1 << 21)) - MOV(32, rn, R(RSCRATCH2)); + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; else + MOV(32, R(ABI_PARAM1), rn); + + if (!(CurInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + if (load && CurInstr.A_Reg(12) == 15) + { + if (byte) + printf("!!! LDRB PC %08X\n", R15); + else + { + if (Num == 1) + AND(32, rd, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rd.GetSimpleReg()); + } + } +} + +void Compiler::A_Comp_MemHalf() +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + OpArg offset = CurInstr.Instr & (1 << 22) + ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : MapReg(CurInstr.A_Reg(0)); + + if (CurInstr.Instr & (1 << 24)) { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); + else + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } + + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + else + MOV(32, R(ABI_PARAM1), rn); - if (load) - MOV(32, R(RSCRATCH2), R(RSCRATCH)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); - if (!(CurrentInstr.Instr & (1 << 24))) + bool signExtend = false; + int size; + if (!load && op == 1) + size = 16; + else if (load) { - OpArg offset = A_Comp_GetMemWBOffset(); + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } - if (CurrentInstr.Instr & (1 << 23)) + if (!(CurInstr.Instr & (1 << 24))) + { + if (CurInstr.Instr & (1 << 23)) ADD(32, rn, offset); else SUB(32, rn, offset); } - if (load) - MOV(32, rd, R(RSCRATCH2)); + Comp_MemAccess(rd, signExtend, !load, size); + + if (load && CurInstr.A_Reg(12) == 15) + printf("!!! MemHalf op PC %08X\n", R15);; } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurrentInstr.Instr >> 10) & 0x3; + int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; - - MOV(32, R(RSCRATCH2), rb); - ADD(32, R(RSCRATCH2), ro); - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 
0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + bool byte = op & 0x1; + + MOV_sum(32, ABI_PARAM1, rb, ro); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } void Compiler::T_Comp_MemImm() { - // TODO: aufräumen!!! - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - - int op = (CurrentInstr.Instr >> 11) & 0x3; - u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); +} + +void Compiler::T_Comp_MemRegHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); + + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 
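/* The two opcode bits pulled from instr bits 10-11 select among the four
   register-offset halfword forms; the decode around this line follows

       op 0: STRH   store, 16 bit
       op 1: LDRSB  load,   8 bit, sign-extended
       op 2: LDRH   load,  16 bit
       op 3: LDRSH  load,  16 bit, sign-extended

   hence load = (op != 0), size = (op != 1 ? 16 : 8), signExtend = (op & 1). */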
16 : 8; + bool signExtend = op & 1; + + MOV_sum(32, ABI_PARAM1, rb, ro); + + Comp_MemAccess(rd, signExtend, !load, size); +} + +void Compiler::T_Comp_MemImmHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, 16); } } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 41c46e1..32a9645 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -317,7 +317,7 @@ Info Decode(bool thumb, u32 num, u32 instr) else { u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; - if ((instr & 0xFE000000) == 0xFA000000) + if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; if (data & A_ARM9Only && num != 0) diff --git a/src/NDS.cpp b/src/NDS.cpp index 2a7edfd..4073536 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -566,6 +566,8 @@ void Reset() KeyCnt = 0; RCnt = 0; + ARMJIT::ResetBlocks(); + NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); -- cgit v1.2.3 From 27cbc821b139b74142630c57f7da11478a052282 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 10 Jul 2019 00:57:59 +0200 Subject: jit: thumb block transfer working also pc and sp relative loads and some refactoring --- src/ARMJIT_RegCache.h | 136 ---------- src/ARMJIT_RegisterCache.h | 136 ++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 82 ++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 515 +++++++++++++++++++++++++++++++----- src/ARM_InstrInfo.cpp | 46 ++-- 6 files changed, 682 insertions(+), 252 deletions(-) delete mode 100644 src/ARMJIT_RegCache.h create mode 100644 src/ARMJIT_RegisterCache.h (limited to 'src') diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h deleted file mode 100644 index 556d27b..0000000 --- a/src/ARMJIT_RegCache.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef ARMJIT_REGCACHE_H -#define ARMJIT_REGCACHE_H - -#include "ARMJIT.h" - -// TODO: replace this in the future -#include "dolphin/BitSet.h" - -#include - -namespace ARMJIT -{ - -template -class RegCache -{ -public: - RegCache() - {} - - RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) - : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) - { - for (int i = 0; i < 16; i++) - Mapping[i] = (Reg)-1; - } - - void UnloadRegister(int reg) - { - assert(Mapping[reg] != -1); - - if (DirtyRegs & (1 << reg)) - Compiler->SaveReg(reg, Mapping[reg]); - - DirtyRegs &= ~(1 << reg); - LoadedRegs &= ~(1 << reg); - NativeRegsUsed &= ~(1 << (int)Mapping[reg]); - Mapping[reg] = (Reg)-1; - } - - void LoadRegister(int reg) - { - assert(Mapping[reg] == -1); - for (int i = 0; i < NativeRegsAvailable; i++) - { - Reg nativeReg = NativeRegAllocOrder[i]; - if (!(NativeRegsUsed & (1 << nativeReg))) - { - Mapping[reg] = nativeReg; - NativeRegsUsed |= 1 << (int)nativeReg; - LoadedRegs |= 1 << reg; - - Compiler->LoadReg(reg, nativeReg); - - return; - } - } - - assert("Welp!"); - } - - void Flush() - { - BitSet16 loadedSet(LoadedRegs); - for (int reg : loadedSet) - UnloadRegister(reg); - } - - void Prepare(int i) - { - u16 futureNeeded = 0; - int ranking[16]; - for (int j = 0; j < 16; j++) - ranking[j] = 0; - for (int j = i; j < InstrsCount; j++) - { - BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); - futureNeeded |= regsNeeded.m_val; - for (int reg : regsNeeded) - 
ranking[reg]++; - } - - // we'll unload all registers which are never used again - BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); - for (int reg : neverNeededAgain) - UnloadRegister(reg); - - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; - BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); - if (needToBeLoaded != BitSet16(0)) - { - int neededCount = needToBeLoaded.Count(); - BitSet16 loadedSet(LoadedRegs); - while (loadedSet.Count() + neededCount > NativeRegsAvailable) - { - int leastReg = -1; - int rank = 1000; - for (int reg : loadedSet) - { - if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) - { - leastReg = reg; - rank = ranking[reg]; - } - } - - assert(leastReg != -1); - UnloadRegister(leastReg); - - loadedSet.m_val = LoadedRegs; - } - - for (int reg : needToBeLoaded) - LoadRegister(reg); - } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); - } - - static const Reg NativeRegAllocOrder[]; - static const int NativeRegsAvailable; - - Reg Mapping[16]; - u32 NativeRegsUsed = 0; - u16 LoadedRegs = 0; - u16 DirtyRegs = 0; - - T* Compiler; - - FetchedInstr* Instrs; - int InstrsCount; -}; - -} - -#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h new file mode 100644 index 0000000..04c1eda --- /dev/null +++ b/src/ARMJIT_RegisterCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include + +namespace ARMJIT +{ + +template +class RegisterCache +{ +public: + RegisterCache() + {} + + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->SaveReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : 
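/* The eviction policy of this loop in a nutshell: among the guest registers
   currently mapped but not needed by the present instruction, spill the one
   with the smallest future-use count, a greedy stand-in for Belady's
   "furthest next use" rule. E.g. with ranks {r0: 5, r4: 1, r7: 3} and r0
   needed now, r4 is the register written back and unmapped. */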
loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b7358a2..4fe0c70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,20 +9,20 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = +const X64Reg RegisterCache::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13 + RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13 + RBX, R12, R13, R14 // this is sad #endif }; template <> -const int RegCache::NativeRegsAvailable = +const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 - 5 + 6 #else - 3 + 4 #endif ; @@ -39,10 +39,47 @@ Compiler::Compiler() MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) + { + MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); + MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); + MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); + } ResetStart = GetWritableCodePtr(); } +void* Compiler::Gen_ChangeCPSRRoutine() +{ + void* res = (void*)GetWritableCodePtr(); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + + SetJumpTarget(fiq); + + SetJumpTarget(irq); + + SetJumpTarget(svc); + + SetJumpTarget(abt); + + SetJumpTarget(und); + + return res; +} + DataRegion Compiler::ClassifyAddress(u32 addr) { if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) @@ -106,12 +143,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); - XOR(32, R(RCycles), R(RCycles)); LoadCPSR(); // TODO: this is ugly as a whole, do better - RegCache = ARMJIT::RegCache(this, instrs, instrsCount); + RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) { @@ -242,7 +278,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs RegCache.Flush(); SaveCPSR(); - LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + MOV(32, R(RAX), Imm32(ConstantCycles)); ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); @@ -306,18 +342,20 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // STRB + //NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD - NULL, NULL, NULL, NULL, - // STRD - NULL, NULL, NULL, NULL, + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // LDRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB @@ -360,10 +398,14 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // branch, etc. - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL + // LDR/STR sp rel + NULL, NULL, + // PUSH/POP + NULL, NULL, + // LDMIA, STMIA + NULL, NULL, + NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL }; return Thumb ? T_Comp[kind] : A_Comp[kind]; @@ -376,7 +418,7 @@ void Compiler::Comp_AddCycles_C() : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -388,13 +430,15 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { + // potential bug: if a register that is still cached gets saved on a mode switch, + // the old value is stored SaveCPSR(); MOV(64, R(ABI_PARAM1), R(RCPU)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9395a29..a751737 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,7 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" -#include "../ARMJIT_RegCache.h" +#include "../ARMJIT_RegisterCache.h" #include @@ -12,7 +12,6 @@ namespace ARMJIT { const Gen::X64Reg RCPU = Gen::RBP; -const Gen::X64Reg RCycles = Gen::R14; const Gen::X64Reg RCPSR = Gen::R15; const Gen::X64Reg RSCRATCH = Gen::EAX; @@ -72,6 +71,7 @@ private: void A_Comp_MemWB(); void A_Comp_MemHalf(); + void A_Comp_LDM_STM(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -86,8 +86,13 @@ private: void T_Comp_MemImm(); void T_Comp_MemRegHalf(); void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + void T_Comp_PUSH_POP(); + void T_Comp_LDMIA_STMIA(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -100,6 +105,11 @@ private: void* Gen_MemoryRoutine9(bool store, int size); void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); + void* Gen_MemoryRoutineSeq9(bool store, bool preinc); + void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); + + void* Gen_ChangeCPSRRoutine(); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -122,11 +132,14 @@ private: void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; + void* MemoryFuncsSeq9[2][2]; + void* MemoryFuncsSeq7[2][2][2]; + bool CPSRDirty = false; FetchedInstr CurInstr; - RegCache RegCache; + RegisterCache RegCache; bool Thumb; u32 Num; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69746e2..20e1893 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -3,16 +3,6 @@ #include "../GPU.h" #include "../Wifi.h" -namespace NDS -{ -extern u8* SWRAM_ARM9; -extern u32 SWRAM_ARM9Mask; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM7Mask; -extern u8 ARM7WRAM[]; -extern u16 ARM7BIOSProt; -} - using namespace Gen; namespace ARMJIT { @@ -41,6 +31,49 @@ int squeezePointer(T* ptr) store value - ABI_PARAM2 (a.k.a.
RDX = RSCRATCH2 on Windows) code cycles - ABI_PARAM3 */ + +#define CALC_CYCLES_9(numC, numD, scratch) \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ + CMP(32, R(numC), R(numD)); \ + CMOVcc(32, numD, R(numC), CC_G); \ + CMP(32, R(numD), R(scratch)); \ + CMOVcc(32, scratch, R(numD), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); +#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + LEA(32, scratch, MRegSum(numD, numC)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + if (!store) \ + ADD(32, R(numC), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } +#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + if (!store) \ + ADD(32, R(numD), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } + void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -56,15 +89,10 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) FixupBranch insideITCM = J_CC(CC_B); // cycle counting! - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 0))); - LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - CMP(32, R(ABI_PARAM4), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); + MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); + SHR(32, R(ABI_PARAM4), Imm8(12)); + MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1))); + CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { @@ -101,7 +129,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -120,7 +148,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) @@ -158,28 +186,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); AND(32, R(RSCRATCH), Imm32(0xFF000000)); CMP(32, R(RSCRATCH), Imm32(0x02000000)); FixupBranch outsideMainRAM = J_CC(CC_NE); - if (codeMainRAM) - { - LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - if (!store) - ADD(32, R(ABI_PARAM3), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); if (store) @@ -205,22 +218,7 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) RET(); SetJumpTarget(outsideMainRAM); - if (codeMainRAM) - { - if (!store) - ADD(32, R(ABI_PARAM4), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 0 : 1)); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { if (size > 8) @@ -257,7 +255,189 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) return res; } -void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +#define MEMORY_SEQ_WHILE_COND \ + if (!store) \ + MOV(32, currentElement, R(EAX));\ + if (!preinc) \ + ADD(32, R(ABI_PARAM1), Imm8(4)); \ + \ + SUB(32, R(ABI_PARAM3), Imm8(1)); \ + J_CC(CC_NZ, repeat); + +/* + ABI_PARAM1 address + ABI_PARAM2 address where registers are stored + ABI_PARAM3 how many values to read/write + ABI_PARAM4 code cycles + + Dolphin x64CodeEmitter is my favourite assembler + */ +void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); + + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM9Write32); + } + else + CALL((void*)NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); + MOVZX(32, 8, 
RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); + + FixupBranch finishIt1 = J(); + + SetJumpTarget(insideDTCM); + AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time + MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential + FixupBranch finishIt2 = J(); + + SetJumpTarget(insideITCM); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); + XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); + MOV(32, R(ABI_PARAM2), Imm32(1)); + + SetJumpTarget(finishIt1); + SetJumpTarget(finishIt2); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM7Write32); + } + else + CALL((void*)NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + SetJumpTarget(outsideMainRAM); + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +#undef CALC_CYCLES_9 +#undef MEMORY_SEQ_WHILE_COND + +void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int 
size) { if (store) MOV(32, R(ABI_PARAM2), rd); @@ -278,6 +458,129 @@ void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int si } } +s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + int regsCount = regs.Count(); + + const u8 userModeOffsets[] = + { + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, + + offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), + offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, + }; + + if (decrement) + { + MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + preinc = !preinc; + } + else + MOV(32, R(ABI_PARAM1), rb); + + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); + if (!store) + { + SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? MemoryFuncsSeq9[0][preinc] + : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + + for (int reg = 15; reg >= 0; reg--) + { + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH2), R(RCPSR)); + AND(32, R(RSCRATCH2), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + POP(RSCRATCH); + MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); + } + else + { + if (reg != 15) + RegCache.DirtyRegs |= (1 << reg); + POP(MapReg(reg).GetSimpleReg()); + } + } + } + + if (regs[15]) + { + if (Num == 1) + OR(32, MapReg(15), Imm8(1)); + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + } + else + { + for (int reg : regs) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); + PUSH(RSCRATCH); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + PUSH(MapReg(reg).GetSimpleReg()); + } + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? 
MemoryFuncsSeq9[1][preinc] + : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + + ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + } + + return (regsCount * 4) * (decrement ? -1 : 1); +} + OpArg Compiler::A_Comp_GetMemWBOffset() { if (!(CurInstr.Instr & (1 << 25))) @@ -354,6 +657,25 @@ void Compiler::A_Comp_MemHalf() ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) : MapReg(CurInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); + + bool signExtend = false; + int size; + if (!load) + { + size = op == 1 ? 16 : 32; + load = op == 2; + } + else if (load) + { + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } + + if (size == 32 && Num == 1) + return; // NOP + if (CurInstr.Instr & (1 << 24)) { if (CurInstr.Instr & (1 << 23)) @@ -370,19 +692,6 @@ void Compiler::A_Comp_MemHalf() else MOV(32, R(ABI_PARAM1), rn); - int op = (CurInstr.Instr >> 5) & 0x3; - bool load = CurInstr.Instr & (1 << 20); - - bool signExtend = false; - int size; - if (!load && op == 1) - size = 16; - else if (load) - { - size = op == 2 ? 8 : 16; - signExtend = op > 1; - } - if (!(CurInstr.Instr & (1 << 24))) { if (CurInstr.Instr & (1 << 23)) @@ -412,6 +721,24 @@ void Compiler::T_Comp_MemReg() Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = (CurInstr.Instr >> 20) & 1; + bool pre = (CurInstr.Instr >> 24) & 1; + bool add = (CurInstr.Instr >> 23) & 1; + bool writeback = (CurInstr.Instr >> 21) & 1; + bool usermode = (CurInstr.Instr >> 22) & 1; + + OpArg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + + if (writeback) + ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? 
Imm8(offset) : Imm32(offset)); +} + void Compiler::T_Comp_MemImm() { OpArg rd = MapReg(CurInstr.T_Reg(0)); @@ -456,4 +783,56 @@ void Compiler::T_Comp_MemImmHalf() Comp_MemAccess(rd, false, !load, 16); } +void Compiler::T_Comp_LoadPCRel() +{ + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + // hopefully this doesn't break + u32 val; CurCPU->DataRead32(addr, &val); + MOV(32, rd, Imm32(val)); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, 32); +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + OpArg sp = MapReg(13); + + s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + + ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + OpArg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + ADD(32, rb, Imm8(offset)); +} + } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 32a9645..c519229 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -25,9 +25,7 @@ enum { A_Link = 1 << 10, - A_LDMSTM = 1 << 11, - - A_ARM9Only = 1 << 12, + A_UnkOnARM7 = 1 << 11, }; #define A_BIOP A_Read16 @@ -97,12 +95,12 @@ const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); -const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ); +const u32 A_CLZ = A_Write12 | A_Read0 | A_UnkOnARM7 | ak(ak_CLZ); -const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD); -const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB); -const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD); -const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB); +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 #define A_STR A_Read12 @@ -144,8 +142,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM); +const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -154,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = 
A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC); +const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -249,7 +247,7 @@ const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); @@ -320,8 +318,10 @@ Info Decode(bool thumb, u32 num, u32 instr) if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; - if (data & A_ARM9Only && num != 0) - data |= A_BranchAlways | A_Link; + if (data & A_UnkOnARM7 && num != 0) + data = A_UNK; + + res.Kind = (data >> 13) & 0x1FF; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); @@ -360,14 +360,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= 1 << 15; } - if (data & A_LDMSTM) - { - res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15); - if (instr & (1 << 21)) - res.DstRegs |= 1 << ((instr >> 16) & 0xF); - } - - res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_LDM) + res.DstRegs |= instr & (1 << 15); // this is right return res; } -- cgit v1.2.3 From 83bd863361e19bc5456bbaaa3d0ec0df3c1731c0 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 11 Jul 2019 16:22:47 +0200 Subject: jit: branch instructions --- src/ARM.cpp | 12 +- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 267 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 185 ++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 30 ++-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 42 +----- src/ARM_InstrInfo.cpp | 6 +- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 1 + 10 files changed, 363 insertions(+), 187 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_Branch.cpp (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index df58ce3..3c2253c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -564,11 +564,8 @@ void ARMv5::Execute() printf("aaarg ungempappter raum %x\n", R[15]);*/ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - if (block == NULL) - ARMJIT::CompileBlock(this); - else - Cycles += block(); - + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + // TODO optimize this shit!!! if (Halted) { @@ -650,10 +647,7 @@ void ARMv4::Execute() printf("aaarg ungempappter raum %x\n", R[15]);*/ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - if (block == NULL) - ARMJIT::CompileBlock(this); - else - Cycles += block(); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); // TODO optimize this shit!!! 
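// A minimal sketch of the lookup-or-compile dispatch idiom used above, assuming
// (as the surrounding code suggests) that CompiledBlock is a plain function
// pointer returning the cycles the block consumed, and that CompileBlock()
// inserts the freshly compiled block into the cache before returning it:
//
//     u32 instrAddr = R[15] - ((CPSR & 0x20) ? 2 : 4);  // undo the pipeline offset (Thumb: 2, ARM: 4)
//     ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr);
//     if (!block)
//         block = ARMJIT::CompileBlock(this);           // compile on first execution
//     Cycles += block();                                // run the native code, collect its cycles
//
// The ternary above folds this sequence into a single statement.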
if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 6afa967..47b425f 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,7 +121,7 @@ void DeInit() delete compiler; } -void CompileBlock(ARM* cpu) +CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -171,6 +171,8 @@ void CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); InsertBlock(cpu->Num, r15Initial - (thumb ? 2 : 4), block); + + return block; } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 71188f9..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -void CompileBlock(ARM* cpu); +CompiledBlock CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..fb2acba --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -0,0 +1,267 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + u32 newPC; + u32 nextInstr[2]; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + OR(32, R(RCPSR), Imm8(0x20)); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + AND(32, R(RCPSR), Imm32(~0x20)); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; + cycles += CurCPU->CodeCycles; + nextInstr[1] = cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + nextInstr[0] = cpu9->CodeRead32(addr, true); + nextInstr[1] = nextInstr[0] >> 16; + cycles += CurCPU->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + nextInstr[0] = cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + nextInstr[1] = cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); + nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + nextInstr[0] = cpu7->CodeRead32(addr); + nextInstr[1] = cpu7->CodeRead32(addr+4); + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + } + } + + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + MOV(32, MDisp(RCPU, 
offsetof(ARM, NextInstr[0])), Imm32(nextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(nextInstr[1])); + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + + if (setupRegion) + { + MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), Imm32(newPC)); + CALL((void*)&ARMv5::SetupCodeMem); + } +} + +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + bool previouslyDirty = CPSRDirty; + SaveCPSR(); + + if (restoreCPSR) + { + if (Thumb || CurInstr.Cond() >= 0xE) + { + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + // the ugly way... + // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + } + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + if (!restoreCPSR) + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + else + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); + + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + + if (previouslyDirty) + LoadCPSR(); + CPSRDirty = previouslyDirty; +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOV(32, MapReg(14), Imm32(R15 - 4)); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + OpArg rn = MapReg(CurInstr.A_Reg(0)); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOV(32, MapReg(14), Imm32(R15 - 4)); + Comp_JumpTo(rn.GetSimpleReg()); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + if (link && Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + + OpArg rn = MapReg(CurInstr.A_Reg(3)); + if (link) + MOV(32, MapReg(14), Imm32(R15 - 1)); + Comp_JumpTo(rn.GetSimpleReg()); +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOV(32, MapReg(14), Imm32(R15 + offset)); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + OpArg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + LEA(32, RSCRATCH, MDisp(lr.GetSimpleReg(), offset)); + MOV(32, lr, Imm32((R15 - 2) | 1)); + if (Num == 1 || CurInstr.Instr & (1 << 12)) + OR(32, R(RSCRATCH), Imm8(1)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +{ + assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); + Comp_AddCycles_C(); + + u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); + target += (CurInstr.Instr & 0x7FF) << 1; + + if (Num == 1 || CurInstr.Instr & (1 << 12)) + target |= 1; + + MOV(32, MapReg(14), Imm32((R15 - 2) | 
1)); + + Comp_JumpTo(target); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 4fe0c70..6799a90 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -50,50 +50,6 @@ Compiler::Compiler() ResetStart = GetWritableCodePtr(); } -void* Compiler::Gen_ChangeCPSRRoutine() -{ - void* res = (void*)GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - CMP(32, R(RSCRATCH), Imm8(0x11)); - FixupBranch fiq = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x12)); - FixupBranch irq = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x13)); - FixupBranch svc = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x17)); - FixupBranch abt = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x1B)); - FixupBranch und = J_CC(CC_E); - - SetJumpTarget(fiq); - - SetJumpTarget(irq); - - SetJumpTarget(svc); - - SetJumpTarget(abt); - - SetJumpTarget(und); - - return res; -} - -DataRegion Compiler::ClassifyAddress(u32 addr) -{ - if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) - return dataRegionDTCM; - switch (addr & 0xFF000000) - { - case 0x02000000: return dataRegionMainRAM; - case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? dataRegionWRAM7 : dataRegionSWRAM; - case 0x04000000: return dataRegionIO; - case 0x06000000: return dataRegionVRAM; - } - return dataRegionGeneric; -} - void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -123,6 +79,29 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } +// invalidates RSCRATCH and RSCRATCH3 +Gen::FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + return J_CC(CC_Z); + } + else + { + // could have used a LUT, but then where would be the fun? + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); + + return J_CC(cond & 1 ? 
CC_NZ : CC_Z); + } +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -140,6 +119,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CodeRegion = cpu->CodeRegion; CurCPU = cpu; + bool mergedThumbBL = false; + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -167,17 +148,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - if (comp == NULL || CurInstr.Info.Branches()) + if (comp == NULL) SaveCPSR(); } - - // run interpreter - cpu->CodeCycles = CurInstr.CodeCycles; - cpu->R[15] = R15; - cpu->CurInstr = CurInstr.Instr; - cpu->NextInstr[0] = CurInstr.NextInstr[0]; - cpu->NextInstr[1] = CurInstr.NextInstr[1]; - + if (comp != NULL) RegCache.Prepare(i); else @@ -185,58 +159,44 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) + if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 + && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) + mergedThumbBL = true; + else { - MOV(64, R(ABI_PARAM1), R(RCPU)); + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + } + else if (mergedThumbBL) + T_Comp_BL_Merged(instrs[i - 1]); + else + (this->*comp)(); } - else - (this->*comp)(); - - ARMInterpreter::THUMBInstrTable[icode](cpu); } else { u32 cond = CurInstr.Cond(); if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { - MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::A_BLX_IMM); - - ARMInterpreter::A_BLX_IMM(cpu); + if (comp) + (this->*comp)(); + else + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } } else if (cond == 0xF) - { Comp_AddCycles_C(); - cpu->AddCycles_C(); - } else { FixupBranch skipExecute; if (cond < 0xE) - { - if (cond >= 0x8) - { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(RSCRATCH3), R(RCPSR)); - SHR(32, R(RSCRATCH3), Imm8(28)); - MOV(32, R(RSCRATCH), Imm32(1)); - SHL(32, R(RSCRATCH), R(RSCRATCH3)); - TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - - skipExecute = J_CC(CC_Z); - } - else - { - // could have used a LUT, but then where would be the fun? - TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - - skipExecute = J_CC(cond & 1 ? CC_NZ : CC_Z); - } - - } + skipExecute = CheckCondition(cond); u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) @@ -258,19 +218,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } - - if (cpu->CheckCondition(cond)) - ARMInterpreter::ARMInstrTable[icode](cpu); - else - cpu->AddCycles_C(); } } - /* - we don't need to collect the interpreted cycles, - since cpu->Cycles is taken into account by the dispatcher. 
- */ - if (comp == NULL && i != instrsCount - 1) LoadCPSR(); } @@ -367,7 +317,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // LDM/STM NULL, NULL, // Branch - NULL, NULL, NULL, NULL, NULL, + A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, // system stuff NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -389,7 +339,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // pc/sp relative T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, // LDR pcrel - NULL, + T_Comp_LoadPCRel, // LDR/STR reg offset T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, // LDR/STR sign extended, half T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, // LDR/STR imm offset T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset T_Comp_MemImmHalf, T_Comp_MemImmHalf, // LDR/STR sp rel - NULL, NULL, + T_Comp_MemSPRel, T_Comp_MemSPRel, // PUSH/POP - NULL, NULL, + T_Comp_PUSH_POP, T_Comp_PUSH_POP, // LDMIA, STMIA - NULL, NULL, - NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL + T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA, + // Branch + T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2, + // Unk, SVC + NULL, NULL }; return Thumb ? T_Comp[kind] : A_Comp[kind]; } -void Compiler::Comp_AddCycles_C() +void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (CurInstr.Cond() < 0xE) + if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -429,25 +381,10 @@ void Compiler::Comp_AddCycles_CI(u32 i) NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; - if (CurInstr.Cond() < 0xE) + if (!Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } -void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) -{ - // potential bug: if a register that is still cached gets saved on a mode switch, - // the old value is stored - SaveCPSR(); - - MOV(64, R(ABI_PARAM1), R(RCPU)); - MOV(32, R(ABI_PARAM2), R(addr)); - MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); - if (Num == 0) - CALL((void*)&ARMv5::JumpTo); - else - CALL((void*)&ARMv4::JumpTo); -} - } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a751737..45b488a 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,19 +22,6 @@ class Compiler; typedef void (Compiler::*CompileFunc)(); -enum DataRegion -{ - dataRegionGeneric, // hey, that's me!
- dataRegionMainRAM, - dataRegionSWRAM, - dataRegionVRAM, - dataRegionIO, - dataRegionExclusive, - dataRegionsCount, - dataRegionDTCM = dataRegionExclusive, - dataRegionWRAM7 = dataRegionExclusive, -}; - class Compiler : public Gen::X64CodeBlock { public: @@ -49,8 +36,9 @@ private: CompileFunc GetCompFunc(int kind); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); - void Comp_AddCycles_C(); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); enum @@ -63,8 +51,6 @@ private: opInvertOp2 = 1 << 5, }; - DataRegion ClassifyAddress(u32 addr); - void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); @@ -73,6 +59,9 @@ private: void A_Comp_MemHalf(); void A_Comp_LDM_STM(); + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -91,6 +80,13 @@ private: void T_Comp_PUSH_POP(); void T_Comp_LDMIA_STMIA(); + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(FetchedInstr prefix); + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); @@ -119,6 +115,8 @@ private: void LoadCPSR(); void SaveCPSR(); + Gen::FixupBranch CheckCondition(u32 cond); + Gen::OpArg MapReg(int reg) { if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 20e1893..69b324c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -462,38 +462,10 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { int regsCount = regs.Count(); - const u8 userModeOffsets[] = - { - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, - - offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), - offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, - }; - if (decrement) { MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); - preinc = !preinc; + preinc ^= true; } else MOV(32, R(ABI_PARAM1), rb); @@ -516,16 +488,16 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + /*if (usermode && reg >= 8 && reg < 15) { MOV(32, R(RSCRATCH2), R(RCPSR)); AND(32, R(RSCRATCH2), Imm8(0x1F)); // (RSCRATCH2 - 0x11) * 8 + 
squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); POP(RSCRATCH); MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else */if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -552,16 +524,16 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + /*if (usermode && reg >= 8 && reg < 15) { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); PUSH(RSCRATCH); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else */if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index c519229..b8dff00 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -255,7 +255,7 @@ const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); -const u32 T_BLX_REG = T_BranchAlways | T_ReadR15 | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); const u32 T_B = T_BranchAlways | tk(tk_B); const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); @@ -301,6 +301,10 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= (1 << 13); if (data & T_ReadR15) res.SrcRegs |= (1 << 15); + if (data & T_WriteR14) + res.DstRegs |= (1 << 14); + if (data & T_ReadR14) + res.SrcRegs |= (1 << 14); if (data & T_BranchAlways) res.DstRegs |= (1 << 15); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index dcd938b..51dcfa2 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -202,6 +202,7 @@ enum tk_POP, tk_LDMIA, tk_STMIA, + tk_BCOND, tk_BX, tk_BLX_REG, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae04ffb..75fa42c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -54,6 +54,7 @@ add_library(core STATIC ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp -- cgit v1.2.3 From f22521a43d8e3ea51493119f9f285cf265f21416 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 03:43:45 +0200 Subject: jit: LDM/STM finally(!) 
working + MUL, MLA and CLZ --- src/ARM.cpp | 7 +++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 74 +++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Branch.cpp | 7 +-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 108 +++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 14 ++++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 116 +++++++++++++++++++++++++----------- 6 files changed, 279 insertions(+), 47 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 3c2253c..baf8468 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -81,8 +81,15 @@ ARMv4::ARMv4() : ARM(1) // } +namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} + void ARM::Reset() { + FILE* blabla = fopen("fhhg", "w"); + for (int i = 0; i < ARMInstrInfo::ak_Count; i++) + fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); + fclose(blabla); + Cycles = 0; Halted = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index c22751e..cbe67fd 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -223,6 +223,73 @@ void Compiler::A_Comp_MovOp() Comp_JumpTo(rd.GetSimpleReg(), S); } +void Compiler::A_Comp_CLZ() +{ + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + + MOV(32, R(RSCRATCH), Imm32(32)); + TEST(32, rm, rm); + FixupBranch skipZero = J_CC(CC_Z); + BSR(32, RSCRATCH, rm); + XOR(32, R(RSCRATCH), Imm8(0x1F)); // 31 - RSCRATCH + SetJumpTarget(skipZero); + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn) +{ + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, add ? 
2 : 1); + } + + static_assert(EAX == RSCRATCH); + MOV(32, R(RSCRATCH), rm); + if (add) + { + IMUL(32, RSCRATCH, rs); + LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); + TEST(32, rd, rd); + } + else + { + IMUL(32, RSCRATCH, rs); + MOV(32, rd, R(RSCRATCH)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + + if (S) + Comp_RetriveFlags(false, false, false); +} + +void Compiler::A_Comp_MUL_MLA() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn; + if (add) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_MulOp(S, add, rd, rm, rs, rn); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -455,6 +522,13 @@ void Compiler::T_Comp_ALU_Imm8() } } +void Compiler::T_Comp_MUL() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + Comp_MulOp(true, false, rd, rd, rs, Imm8(-1)); +} + void Compiler::T_Comp_ALU() { OpArg rd = MapReg(CurInstr.T_Reg(0)); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index fb2acba..bd01ffb 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -126,17 +126,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) { if (Thumb || CurInstr.Cond() >= 0xE) - { - for (int reg : hiRegsLoaded) - RegCache.UnloadRegister(reg); - } + RegCache.Flush(); else { // the ugly way... diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6799a90..8a895d1 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,10 +26,14 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +int instructionPopularityARM[ARMInstrInfo::ak_Count]; + Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); + memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -47,7 +51,88 @@ Compiler::Compiler() MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); } - ResetStart = GetWritableCodePtr(); + { + // RSCRATCH mode + // ABI_PARAM2 reg number + // ABI_PARAM3 value in current mode + // ret - ABI_PARAM3 + ReadBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + RET(); + SetJumpTarget(irq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + RET(); + SetJumpTarget(svc); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + RET(); + SetJumpTarget(abt); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + RET(); + SetJumpTarget(und); + MOV(32, 
R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + RET(); + } + { + // RSCRATCH mode + // ABI_PARAM2 reg n + // ABI_PARAM3 value + // carry flag set if the register isn't banked + WriteBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + STC(); + RET(); + + SetJumpTarget(fiq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(irq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(svc); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(abt); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(und); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + CLC(); + RET(); + } + + ResetStart = (void*)GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -136,6 +221,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); + + if (!Thumb) + instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; if (comp == NULL || i == instrsCount - 1) { @@ -287,9 +375,9 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -315,7 +403,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // swap NULL, NULL, // LDM/STM - NULL, NULL, + A_Comp_LDM_STM, A_Comp_LDM_STM, // Branch A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, // system stuff @@ -333,7 +421,7 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative @@ -387,4 +475,14 @@ void Compiler::Comp_AddCycles_CI(u32 i) ConstantCycles += cycles; } +void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
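/* Presumably the same fetch rule as the constant variant: ARM9 code cycles are
   charged once per aligned 32-bit fetch, so an instruction sitting in the upper
   halfword (R15 & 0x2) contributes nothing here. The LEA below then folds the
   run-time count in register i, the fixed `add` and the fetch cycles into a
   single flag-preserving addition, accumulated straight into ARM::Cycles. */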
0 : CurInstr.CodeCycles); + + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 45b488a..89dfe28 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -40,6 +40,7 @@ private: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); + void Comp_AddCycles_CI(Gen::X64Reg i, int add); enum { @@ -55,6 +56,10 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MUL_MLA(); + + void A_Comp_CLZ(); + void A_Comp_MemWB(); void A_Comp_MemHalf(); void A_Comp_LDM_STM(); @@ -62,11 +67,13 @@ private: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MUL(); void T_Comp_RelAddr(); void T_Comp_AddSP(); @@ -88,7 +95,7 @@ private: void T_Comp_BL_Merged(FetchedInstr prefix); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); - s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -96,6 +103,8 @@ private: Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + void Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn); + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); void* Gen_MemoryRoutine9(bool store, int size); @@ -133,6 +142,9 @@ private: void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; + void* ReadBanked; + void* WriteBanked; + bool CPSRDirty = false; FetchedInstr CurInstr; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69b324c..8fbcafd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,7 +1,5 @@ #include "ARMJIT_Compiler.h" -#include "../GPU.h" -#include "../Wifi.h" using namespace Gen; @@ -362,7 +360,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -413,10 +411,11 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) POP(ABI_PARAM4); POP(ABI_PARAM3); + // TODO: optimise this CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -458,25 +457,35 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +void printStuff2(u32 a, u32 b) { + printf("b %x %x\n", a, b); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + FILE* f; + const u8* start = 
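/* `start` captures the first byte emitted for this block; together with the
   fopen("ldm", "a") dump at the end of the function it looks like a temporary
   debugging aid for the new usermode LDM/STM path (both pieces are deleted
   again in the "jit: fix linux" commit below). */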
GetCodePtr(); + int regsCount = regs.Count(); if (decrement) { - MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); preinc ^= true; } else - MOV(32, R(ABI_PARAM1), rb); + MOV(32, R(ABI_PARAM1), MapReg(rn)); + + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - u32 cycles = Num + u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -484,20 +493,29 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ? MemoryFuncsSeq9[0][preinc] : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + bool firstUserMode = true; for (int reg = 15; reg >= 0; reg--) { if (regs[reg]) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH2), R(RCPSR)); - AND(32, R(RSCRATCH2), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - POP(RSCRATCH); - MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + POP(ABI_PARAM3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + SaveReg(reg, ABI_PARAM3); + SetJumpTarget(sucessfulWritten); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -516,32 +534,48 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei if (regs[15]) { if (Num == 1) - OR(32, MapReg(15), Imm8(1)); + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); } } else { + bool firstUserMode = true; for (int reg : regs) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); - PUSH(RSCRATCH); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, ABI_PARAM3); + else + MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(ABI_PARAM3); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); } else + { PUSH(MapReg(reg).GetSimpleReg()); + } } MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); CALL(Num == 0 ? 
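/* Store path: the register values were PUSHed onto the host stack above, so
   ABI_PARAM2 = RSP points at them as a plain buffer and ABI_PARAM3 carries the
   count. Num selects the ARM9 sequential helper or one of two ARM7 variants
   keyed on whether code executes from main RAM (CodeRegion == 0x02), apparently
   to apply the right wait states. */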
MemoryFuncsSeq9[1][preinc] @@ -550,7 +584,14 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } - return (regsCount * 4) * (decrement ? -1 : 1); + if (usermode && !store) + { + f= fopen("ldm", "a"); + fwrite(start, GetCodePtr() - start, 1, f); + fclose(f); + } + + return offset; } OpArg Compiler::A_Comp_GetMemWBOffset() @@ -697,16 +738,20 @@ void Compiler::A_Comp_LDM_STM() { BitSet16 regs(CurInstr.Instr & 0xFFFF); - bool load = (CurInstr.Instr >> 20) & 1; - bool pre = (CurInstr.Instr >> 24) & 1; - bool add = (CurInstr.Instr >> 23) & 1; - bool writeback = (CurInstr.Instr >> 21) & 1; - bool usermode = (CurInstr.Instr >> 22) & 1; + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); OpArg rn = MapReg(CurInstr.A_Reg(16)); - s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; if (writeback) ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset)); } @@ -789,8 +834,7 @@ void Compiler::T_Comp_PUSH_POP() } OpArg sp = MapReg(13); - - s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max } @@ -801,7 +845,7 @@ void Compiler::T_Comp_LDMIA_STMIA() OpArg rb = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); if (!load || !regs[CurInstr.T_Reg(8)]) ADD(32, rb, Imm8(offset)); -- cgit v1.2.3 From 9336fcbbe66be0b65f036f26899e413be54c3491 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 16:42:42 +0200 Subject: jit: SMULL and SMLAL --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 56 ++++++++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index cbe67fd..4afafed 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -290,6 +290,59 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } +void Compiler::A_Comp_SMULL_SMLAL() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn = MapReg(CurInstr.A_Reg(12)); + + if (Num == 0) + Comp_AddCycles_CI(S ? 
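/* Cycle model as in Comp_MulOp: a fixed count on the ARM9 (3 when S is set,
   per this code), while the ARM7 path below repeats the BSR-based estimate on
   rs. The multiply itself sign-extends rm and rs to 64 bit with MOVSX and uses
   a single 64-bit IMUL; for the accumulate case the rd:rn pair is first
   reassembled with SHL/OR so the addition is one 64-bit ADD, and at the end
   rn receives the low half and rd the high half. */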
3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, 2); + } + + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + if (add) + { + MOV(32, R(RSCRATCH), rd); + SHL(64, R(RSCRATCH), Imm8(32)); + OR(64, R(RSCRATCH), rn); + + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + ADD(64, R(RSCRATCH2), R(RSCRATCH)); + } + else + { + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + if (S) + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + } + + if (S) + Comp_RetriveFlags(false, false, false); + + MOV(32, rn, R(RSCRATCH2)); + SHR(64, R(RSCRATCH2), Imm8(32)); + MOV(32, rd, R(RSCRATCH2)); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -302,9 +355,6 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); - SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8a895d1..b6dd529 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -375,7 +375,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 89dfe28..f9bc227 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -57,6 +57,7 @@ private: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); + void A_Comp_SMULL_SMLAL(); void A_Comp_CLZ(); -- cgit v1.2.3 From 24aff49ae496b2401039b09e120dd0fcbd7b8e9e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 17:01:10 +0200 Subject: jit: fix wrongly placed const --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b6dd529..e043f58 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -328,7 +328,7 @@ CompileFunc Compiler::GetCompFunc(int kind) { // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
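/* (the tables are indexed directly with the ARMInstrInfo kind enums, so the
   entry order has to match ARMInstrInfo.h exactly; a NULL entry makes
   CompileBlock fall back to the interpreter for that instruction) */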
// see ARMInstrInfo.h for the order - const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = + CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = { // AND A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, @@ -410,7 +410,7 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; - const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { // Shift imm T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, // Three operand ADD/SUB -- cgit v1.2.3 From 0ff79ea2ad645f85f3a4878be3717ffda44f9cbe Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 02:37:32 +0200 Subject: jit: fix linux --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 48 +++--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 288 +++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 8 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 15 +- src/dolphin/Log.h | 13 +- src/dolphin/MemoryUtil.cpp | 13 +- 7 files changed, 193 insertions(+), 194 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 4afafed..013f54c 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -154,13 +154,13 @@ void Compiler::A_Comp_Arith() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x2: // SUB - Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOp(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x3: // RSB if (op2.IsZero()) @@ -172,25 +172,25 @@ void Compiler::A_Comp_Arith() Comp_RetriveFlags(true, true, false); } else - Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOpReverse(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x4: // ADD - Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); break; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC - Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); break; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); break; default: assert("unimplemented"); @@ -392,11 +392,11 @@ OpArg 
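/* The &Compiler:: qualifications in this hunk are the actual Linux fix in this
   file: naming a member function like SHL without the explicit &Compiler:: when
   forming a pointer to member is a non-standard shorthand that MSVC accepts but
   GCC and Clang reject, so this code previously only built on Windows. The same
   change repeats in the shift helper: */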
Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b { void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; if (op == 0) - shiftOp = SHL; + shiftOp = &Compiler::SHL; else if (op == 1) - shiftOp = SHR; + shiftOp = &Compiler::SHR; else if (op == 2) - shiftOp = SAR; + shiftOp = &Compiler::SAR; CMP(32, R(ECX), Imm8(32)); FixupBranch lt32 = J_CC(CC_L); @@ -539,9 +539,9 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); if (op & 1) - Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else - Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); } void Compiler::T_Comp_ALU_Imm8() @@ -564,10 +564,10 @@ void Compiler::T_Comp_ALU_Imm8() Comp_CmpOp(2, rd, imm, false); return; case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); return; case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); return; } } @@ -594,10 +594,10 @@ void Compiler::T_Comp_ALU() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x2: case 0x3: @@ -613,10 +613,10 @@ void Compiler::T_Comp_ALU() } return; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); return; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); return; case 0x8: // TST Comp_CmpOp(0, rd, rs, false); @@ -634,10 +634,10 @@ void Compiler::T_Comp_ALU() Comp_CmpOp(3, rd, rs, false); return; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::OR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); return; case 0xF: // MVN if (rd != rs) @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index bd01ffb..05c8ec6 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -118,7 +118,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (setupRegion) { - MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), Imm32(newPC)); 
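/* The 32 -> 64 width change just above is load-bearing: MOV(32, ...) zeroes the
   upper half of ABI_PARAM1 and would truncate the ARM* pointer, which only
   happens to work while the object sits below 4 GiB. The full 64-bit move is
   needed before the call: */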
CALL((void*)&ARMv5::SetupCodeMem); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index e043f58..2b7ccd2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,6 +4,12 @@ #include +#ifdef _WIN32 +#else +#include +#include +#endif + using namespace Gen; namespace ARMJIT @@ -28,9 +34,34 @@ const int RegisterCache::NativeRegsAvailable = int instructionPopularityARM[ARMInstrInfo::ak_Count]; +/* + We'll repurpose this .bss memory + + */ +u8 CodeMemory[1024 * 1024 * 32]; + Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 16); +#ifdef _WIN32 +#else + u64 pagesize = sysconf(_SC_PAGE_SIZE); +#endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; + +#ifdef _WIN32 +#else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); +#endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + + ClearCodeSpace(); + + SetCodePtr(pageAligned); memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); @@ -187,6 +218,124 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) } } +#define F(x) &Compiler::x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // EOR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SUB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADD + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SBC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), 
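/* each data-processing row spans 18 entries: presumably the 9 operand-2 forms
   (LSL/LSR/ASR/ROR by immediate, the same four by register, plus immediate)
   times two for the S-flag variant, mirroring the ak_* enum layout */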
F(A_Comp_Arith), F(A_Comp_Arith), + // RSC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ORR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MOV + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // BIC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MVN + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // TST + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // TEQ + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMP + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMN + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // Mul + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + F(A_Comp_CLZ), NULL, NULL, NULL, NULL, + // STR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSB + F(A_Comp_MemHalf), 
F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // swap + NULL, NULL, + // LDM/STM + F(A_Comp_LDM_STM), F(A_Comp_LDM_STM), + // Branch + F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + // Shift imm + F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), + // Three operand ADD/SUB + F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), + // 8 bit imm + F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), + // general ALU + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU), + // hi reg + F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), + // pc/sp relative + F(T_Comp_RelAddr), F(T_Comp_RelAddr), F(T_Comp_AddSP), + // LDR pcrel + F(T_Comp_LoadPCRel), + // LDR/STR reg offset + F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), + // LDR/STR sign extended, half + F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), + // LDR/STR imm offset + F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), + // LDR/STR half imm offset + F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf), + // LDR/STR sp rel + F(T_Comp_MemSPRel), F(T_Comp_MemSPRel), + // PUSH/POP + F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), + // LDMIA, STMIA + F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), + // Branch + F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), + // Unk, SVC + NULL, NULL +}; +#undef F + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -206,7 +355,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -220,8 +369,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; - CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); - + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + if (!Thumb) instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; @@ -318,139 +469,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); RET(); return res; } -CompileFunc Compiler::GetCompFunc(int kind) -{ - // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
- // see ARMInstrInfo.h for the order - CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = - { - // AND - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // EOR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SUB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADD - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SBC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ORR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MOV - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // BIC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MVN - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // TST - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // TEQ - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // 
CMP - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // CMN - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, - // ARMv5 stuff - A_Comp_CLZ, NULL, NULL, NULL, NULL, - // STR - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // STRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDR - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // STRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // LDRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSB - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // swap - NULL, NULL, - // LDM/STM - A_Comp_LDM_STM, A_Comp_LDM_STM, - // Branch - A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, - // system stuff - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - - CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { - // Shift imm - T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, - // Three operand ADD/SUB - T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, - // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, - // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, - // hi reg - T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, - // pc/sp relative - T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, - // LDR pcrel - T_Comp_LoadPCRel, - // LDR/STR reg offset - T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, - // LDR/STR sign extended, half - T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, - // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, - // LDR/STR half imm offset - T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // LDR/STR sp rel - T_Comp_MemSPRel, T_Comp_MemSPRel, - // PUSH/POP - T_Comp_PUSH_POP, T_Comp_PUSH_POP, - // LDMIA, STMIA - T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA, - // Branch - T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2, - // Unk, SVC - NULL, NULL - }; - - return Thumb ? T_Comp[kind] : A_Comp[kind]; -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f9bc227..e04f96a 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,8 +6,6 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegisterCache.h" -#include - namespace ARMJIT { @@ -18,9 +16,6 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler; - -typedef void (Compiler::*CompileFunc)(); class Compiler : public Gen::X64CodeBlock { @@ -32,8 +27,7 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); -private: - CompileFunc GetCompFunc(int kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 8fbcafd..15a40f8 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -464,9 +464,6 @@ void printStuff2(u32 a, u32 b) s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - FILE* f; - const u8* start = GetCodePtr(); - int regsCount = regs.Count(); if (decrement) @@ -482,11 +479,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -581,14 +579,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); - } - - if (usermode && !store) - { - f= fopen("ldm", "a"); - fwrite(start, GetCodePtr() - start, 1, f); - fclose(f); + ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } return offset; diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h index 21e69a5..a7f4b6a 100644 --- a/src/dolphin/Log.h +++ b/src/dolphin/Log.h @@ -4,12 +4,13 @@ #include -#define PanicAlert(msg) \ - do \ - { \ - printf("%s\n", msg); \ - Crash(); \ - } while (false) +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + #define DYNA_REC 0 diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp index 01cb897..7273a8a 100644 --- a/src/dolphin/MemoryUtil.cpp +++ b/src/dolphin/MemoryUtil.cpp @@ -6,15 +6,9 @@ #include #include -#define PanicAlert(fmt, ...) 
\ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - #include "../types.h" #include "CommonFuncs.h" +#include "Log.h" #ifdef _WIN32 #include @@ -39,8 +33,6 @@ namespace Common void* AllocateExecutableMemory(size_t size) { - printf("c\n"); - #if defined(_WIN32) void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); #else @@ -50,13 +42,10 @@ void* AllocateExecutableMemory(size_t size) if (ptr == MAP_FAILED) ptr = nullptr; #endif - printf("a\n"); if (ptr == nullptr) PanicAlert("Failed to allocate executable memory"); - printf("b\n"); - return ptr; } -- cgit v1.2.3 From d13d625f7363449c3fdc041b0a22005b92c83229 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 04:33:36 +0200 Subject: jit: make everything configurable --- src/ARM.cpp | 127 ++++++++++++++++++++++++++++----- src/ARM.h | 3 + src/ARMJIT.cpp | 21 ++++-- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 14 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/Config.cpp | 6 ++ src/Config.h | 3 + src/NDS.cpp | 26 ++++++- src/frontend/qt_sdl/PlatformConfig.cpp | 1 + 10 files changed, 171 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index baf8468..1cd4bb2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -532,7 +532,7 @@ void ARMv5::Execute() while (NDS::ARM9Timestamp < NDS::ARM9Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -565,14 +565,8 @@ void ARMv5::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); - + } + // TODO optimize this shit!!! if (Halted) { @@ -597,6 +591,58 @@ void ARMv5::Execute() Halted = 0; } +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(0, instrAddr)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + if (Halted) + { + if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + if (IRQ) TriggerIRQ(); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} + void ARMv4::Execute() { if (Halted) @@ -620,7 +666,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -648,13 +694,7 @@ void ARMv4::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + } // TODO optimize this shit!!! 
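/* From here on the two paths are separated: Execute() is the plain interpreter
   again, while the new ExecuteJIT() variants do the block lookup, compile on a
   miss, and simply fast-forward the timestamp with a warning if the PC ever
   leaves JIT-mapped memory. Which variant runs is picked per frame from
   Config::JIT_Enable by the templated RunFrame in NDS.cpp further down. */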
if (Halted) @@ -679,3 +719,56 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; } + +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(1, instrAddr)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + // TODO optimize this shit!!! + if (Halted) + { + if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + + if (IRQ) TriggerIRQ(); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index e0832e2..3b01ef3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,6 +52,7 @@ public: } virtual void Execute() = 0; + virtual void ExecuteJIT() = 0; bool CheckCondition(u32 code) { @@ -159,6 +160,7 @@ public: void DataAbort(); void Execute(); + void ExecuteJIT(); // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -281,6 +283,7 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); + void ExecuteJIT(); u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 47b425f..e8e6be0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -125,18 +127,21 @@ CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; - FetchedInstr instrs[12]; + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 r15Initial = cpu->R[15]; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; - //printf("block %x %d\n", r15, thumb); do { r15 += thumb ? 2 : 4; instrs[i].Instr = nextInstr[0]; - //printf("%x %x\n", instrs[i].Instr, r15); instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; if (cpu->Num == 0) @@ -166,16 +171,16 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < 10); + } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, r15Initial - (thumb ? 
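/* The block is now keyed on blockAddr, computed once up front as PC minus the
   pipeline offset. Note that instrs[] above is sized by the run-time value
   Config::JIT_MaxBlockSize (a variable-length array, a GCC/Clang extension
   rather than standard C++), hence the clamp to the 1..32 range before it is
   declared. */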
2 : 4), block); + InsertBlock(cpu->Num, blockAddr, block); return block; } -void ResetBlocks() +void InvalidateBlockCache() { memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); @@ -185,6 +190,8 @@ void ResetBlocks() memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + + compiler->Reset(); } } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..004256c 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -111,7 +111,7 @@ void DeInit(); CompiledBlock CompileBlock(ARM* cpu); -void ResetBlocks(); +void InvalidateBlockCache(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 2b7ccd2..fe23859 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -336,13 +336,15 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +void Compiler::Reset() +{ + SetCodePtr((u8*)ResetStart); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) - { - ResetBlocks(); - SetCodePtr((u8*)ResetStart); - } + InvalidateBlockCache(); CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -355,7 +357,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -469,7 +471,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); return res; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e04f96a..cd58012 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,6 +22,8 @@ class Compiler : public Gen::X64CodeBlock public: Compiler(); + void Reset(); + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); diff --git a/src/Config.cpp b/src/Config.cpp index 5745f34..5c0892a 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,9 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +bool JIT_Enable = false; +int JIT_MaxBlockSize = 12; + ConfigEntry ConfigFile[] = { {"BIOS9Path", 1, BIOS9Path, 0, "", 1023}, @@ -48,6 +51,9 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 3947598..9dda157 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,6 +51,9 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +extern bool JIT_Enable; +extern int JIT_MaxBlockSize; + } #endif // CONFIG_H diff --git a/src/NDS.cpp b/src/NDS.cpp index 4073536..cb85d13 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -566,7 +566,7 @@ void Reset() KeyCnt = 0; RCnt = 0; - ARMJIT::ResetBlocks(); + 
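/* The rename reflects the wider role: besides clearing every block-pointer
   array, the function now also calls compiler->Reset(), which rewinds the
   emitter to ResetStart so the code buffer is reused. It is additionally
   invoked when a savestate is loaded (see the DoSavestate hunk below), since
   cached blocks no longer match the restored memory contents. */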
ARMJIT::InvalidateBlockCache(); NDSCart::Reset(); GBACart::Reset(); @@ -794,6 +794,11 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } + if (!file->Saving) + { + ARMJIT::InvalidateBlockCache(); + } + return true; } @@ -884,6 +889,7 @@ void RunSystem(u64 timestamp) } } +template u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -917,7 +923,10 @@ u32 RunFrame() } else { - ARM9->Execute(); + if (EnableJIT) + ARM9->ExecuteJIT(); + else + ARM9->Execute(); } RunTimers(0); @@ -940,7 +949,10 @@ u32 RunFrame() } else { - ARM7->Execute(); + if (EnableJIT) + ARM7->ExecuteJIT(); + else + ARM7->Execute(); } RunTimers(1); @@ -970,6 +982,14 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ + if (Config::JIT_Enable) + return RunFrame(); + else + return RunFrame(); +} + void Reschedule(u64 target) { if (CurCPU == 0) diff --git a/src/frontend/qt_sdl/PlatformConfig.cpp b/src/frontend/qt_sdl/PlatformConfig.cpp index 06128d7..bfb3f97 100644 --- a/src/frontend/qt_sdl/PlatformConfig.cpp +++ b/src/frontend/qt_sdl/PlatformConfig.cpp @@ -72,6 +72,7 @@ char MicWavPath[1024]; char LastROMFolder[1024]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { -- cgit v1.2.3 From fc82ca1a97ce8304bf563ca53187227e505eb54e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 18:08:42 +0200 Subject: jit: remove unnessary files from dolphin --- src/dolphin/CodeBlock.h | 29 +------- src/dolphin/MemoryUtil.cpp | 182 --------------------------------------------- src/dolphin/MemoryUtil.h | 22 ------ 3 files changed, 1 insertion(+), 232 deletions(-) delete mode 100644 src/dolphin/MemoryUtil.cpp delete mode 100644 src/dolphin/MemoryUtil.h (limited to 'src') diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h index 1434297..31a8d93 100644 --- a/src/dolphin/CodeBlock.h +++ b/src/dolphin/CodeBlock.h @@ -49,15 +49,6 @@ public: CodeBlock(CodeBlock&&) = delete; CodeBlock& operator=(CodeBlock&&) = delete; - // Call this before you generate any code. - void AllocCodeSpace(size_t size) - { - region_size = size; - total_region_size = size; - region = static_cast(Common::AllocateExecutableMemory(total_region_size)); - T::SetCodePtr(region); - } - // Always clear code space with breakpoints, so that if someone accidentally executes // uninitialized, it just breaks into the debugger. void ClearCodeSpace() @@ -66,26 +57,8 @@ public: ResetCodePtr(); } - // Call this when shutting down. Don't rely on the destructor, even though it'll do the job. - void FreeCodeSpace() - { - ASSERT(!m_is_child); - Common::FreeMemoryPages(region, total_region_size); - region = nullptr; - region_size = 0; - total_region_size = 0; - for (CodeBlock* child : m_children) - { - child->region = nullptr; - child->region_size = 0; - child->total_region_size = 0; - } - } - bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } - // Cannot currently be undone. Will write protect the entire code region. - // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()). - void WriteProtect() { Common::WriteProtectMemory(region, region_size, true); } + void ResetCodePtr() { T::SetCodePtr(region); } size_t GetSpaceLeft() const { diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp deleted file mode 100644 index 7273a8a..0000000 --- a/src/dolphin/MemoryUtil.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright 2008 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. 
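/* Everything below is plain file deletion: since the compiler now carves its
   buffer out of a static array and calls mprotect itself (see the earlier
   "jit: fix linux" commit), Dolphin's MemoryUtil allocation helpers have no
   remaining users and the file can go, along with its header. */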
- -#include <cstddef> -#include <cstdlib> -#include <string> - -#include "../types.h" -#include "CommonFuncs.h" -#include "Log.h" - -#ifdef _WIN32 -#include <windows.h> -//#include "Common/StringUtil.h" -#else -#include <stdio.h> -#include <sys/mman.h> -#include <sys/types.h> -#if defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__ -#include <sys/sysctl.h> -#elif defined __HAIKU__ -#include <OS.h> -#else -#include <sys/sysinfo.h> -#endif -#endif - -namespace Common -{ -// This is purposely not a full wrapper for virtualalloc/mmap, but it -// provides exactly the primitive operations that Dolphin needs. - -void* AllocateExecutableMemory(size_t size) -{ -#if defined(_WIN32) - void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); -#else - void* ptr = - mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0); - - if (ptr == MAP_FAILED) - ptr = nullptr; -#endif - - if (ptr == nullptr) - PanicAlert("Failed to allocate executable memory"); - - return ptr; -} - -void* AllocateMemoryPages(size_t size) -{ -#ifdef _WIN32 - void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_READWRITE); -#else - void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); - - if (ptr == MAP_FAILED) - ptr = nullptr; -#endif - - if (ptr == nullptr) - PanicAlert("Failed to allocate raw memory"); - - return ptr; -} - -void* AllocateAlignedMemory(size_t size, size_t alignment) -{ -#ifdef _WIN32 - void* ptr = _aligned_malloc(size, alignment); -#else - void* ptr = nullptr; - if (posix_memalign(&ptr, alignment, size) != 0) - ERROR_LOG(MEMMAP, "Failed to allocate aligned memory"); -#endif - - if (ptr == nullptr) - PanicAlert("Failed to allocate aligned memory"); - - return ptr; -} - -void FreeMemoryPages(void* ptr, size_t size) -{ - if (ptr) - { -#ifdef _WIN32 - if (!VirtualFree(ptr, 0, MEM_RELEASE)) - PanicAlert("FreeMemoryPages failed!\nVirtualFree: %s", GetLastErrorString().c_str()); -#else - if (munmap(ptr, size) != 0) - PanicAlert("FreeMemoryPages failed!\nmunmap: %s", LastStrerrorString().c_str()); -#endif - } -} - -void FreeAlignedMemory(void* ptr) -{ - if (ptr) - { -#ifdef _WIN32 - _aligned_free(ptr); -#else - free(ptr); -#endif - } -} - -void ReadProtectMemory(void* ptr, size_t size) -{ -#ifdef _WIN32 - DWORD oldValue; - if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue)) - PanicAlert("ReadProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str()); -#else - if (mprotect(ptr, size, PROT_NONE) != 0) - PanicAlert("ReadProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str()); -#endif -} - -void WriteProtectMemory(void* ptr, size_t size, bool allowExecute) -{ -#ifdef _WIN32 - DWORD oldValue; - if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, &oldValue)) - PanicAlert("WriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str()); -#else - if (mprotect(ptr, size, allowExecute ? (PROT_READ | PROT_EXEC) : PROT_READ) != 0) - PanicAlert("WriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str()); -#endif -} - -void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute) -{ -#ifdef _WIN32 - DWORD oldValue; - if (!VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldValue)) - PanicAlert("UnWriteProtectMemory failed!\nVirtualProtect: %s", GetLastErrorString().c_str()); -#else - if (mprotect(ptr, size, - allowExecute ?
(PROT_READ | PROT_WRITE | PROT_EXEC) : PROT_WRITE | PROT_READ) != 0) - { - PanicAlert("UnWriteProtectMemory failed!\nmprotect: %s", LastStrerrorString().c_str()); - } -#endif -} - -size_t MemPhysical() -{ -#ifdef _WIN32 - MEMORYSTATUSEX memInfo; - memInfo.dwLength = sizeof(MEMORYSTATUSEX); - GlobalMemoryStatusEx(&memInfo); - return memInfo.ullTotalPhys; -#elif defined __APPLE__ || defined __FreeBSD__ || defined __OpenBSD__ - int mib[2]; - size_t physical_memory; - mib[0] = CTL_HW; -#ifdef __APPLE__ - mib[1] = HW_MEMSIZE; -#elif defined __FreeBSD__ - mib[1] = HW_REALMEM; -#elif defined __OpenBSD__ - mib[1] = HW_PHYSMEM; -#endif - size_t length = sizeof(size_t); - sysctl(mib, 2, &physical_memory, &length, NULL, 0); - return physical_memory; -#elif defined __HAIKU__ - system_info sysinfo; - get_system_info(&sysinfo); - return static_cast<size_t>(sysinfo.max_pages * B_PAGE_SIZE); -#else - struct sysinfo memInfo; - sysinfo(&memInfo); - return (size_t)memInfo.totalram * memInfo.mem_unit; -#endif -} - -} // namespace Common diff --git a/src/dolphin/MemoryUtil.h b/src/dolphin/MemoryUtil.h deleted file mode 100644 index 607b7a8..0000000 --- a/src/dolphin/MemoryUtil.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2008 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include <cstddef> -#include <string> - -namespace Common -{ -void* AllocateExecutableMemory(size_t size); -void* AllocateMemoryPages(size_t size); -void FreeMemoryPages(void* ptr, size_t size); -void* AllocateAlignedMemory(size_t size, size_t alignment); -void FreeAlignedMemory(void* ptr); -void ReadProtectMemory(void* ptr, size_t size); -void WriteProtectMemory(void* ptr, size_t size, bool executable = false); -void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false); -size_t MemPhysical(); - -} // namespace Common -- cgit v1.2.3 From 86f2be7260f9a9b51efd7c795c28cdcfda775742 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 19:24:00 +0200 Subject: jit: add compile option --- CMakeLists.txt | 36 ++++++++++++++++++++++ src/ARM.cpp | 13 ++++---- src/ARM.h | 6 ++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 - src/CMakeLists.txt | 25 +++++++++------- src/CP15.cpp | 12 ++++++-- src/Config.cpp | 4 +++ src/Config.h | 2 ++ src/NDS.cpp | 26 ++++++++++++++++ src/dolphin/CodeBlock.h | 3 -- 11 files changed, 136 insertions(+), 53 deletions(-) (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index 885f0dd..1e53c60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,42 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64) + option(ENABLE_JIT "Enable x64 JIT recompiler" ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() +
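The ENABLE_JIT option above only adds the JIT_ENABLED define; the executable memory itself comes from the .bss trick in the Compiler constructor further down, which flips the protection of a static buffer in place instead of calling an allocator. A rough sketch of that technique, assuming a POSIX system with mprotect (the real code also has a VirtualProtect path for Win32) and using hypothetical names:

#include <cstddef>
#include <cstdint>
#include <sys/mman.h>
#include <unistd.h>

static uint8_t CodeMemory[1024 * 1024]; // static storage, no allocator involved

// Make the page-aligned interior of CodeMemory executable and return it.
static uint8_t* CarveExecutableRegion(size_t& size)
{
    uintptr_t page = (uintptr_t)sysconf(_SC_PAGE_SIZE);
    uintptr_t start = ((uintptr_t)CodeMemory & ~(page - 1)) + page;
    uintptr_t end = ((uintptr_t)CodeMemory + sizeof(CodeMemory)) & ~(page - 1);

    if (mprotect((void*)start, end - start, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
        return nullptr; // e.g. a W^X policy refusing RWX mappings

    size = end - start;
    return (uint8_t*)start;
}

Platforms enforcing W^X may reject the RWX mapping outright, which is presumably one reason Dolphin's deleted helpers kept allocation and protection as separate operations.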
+if (CMAKE_BUILD_TYPE STREQUAL Release) + option(ENABLE_LTO "Enable link-time optimization" ON) +else() + option(ENABLE_LTO "Enable link-time optimization" OFF) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Og) endif() diff --git a/src/ARM.cpp b/src/ARM.cpp index 1cd4bb2..bfe1890 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -81,15 +81,8 @@ ARMv4::ARMv4() : ARM(1) // } -namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} - void ARM::Reset() { - FILE* blabla = fopen("fhhg", "w"); - for (int i = 0; i < ARMInstrInfo::ak_Count; i++) - fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); - fclose(blabla); - Cycles = 0; Halted = 0; @@ -591,6 +584,7 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv5::ExecuteJIT() { if (Halted) @@ -642,6 +636,7 @@ void ARMv5::ExecuteJIT() if (Halted == 2) Halted = 0; } +#endif void ARMv4::Execute() { @@ -720,6 +715,7 @@ void ARMv4::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv4::ExecuteJIT() { if (Halted) @@ -771,4 +767,5 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; -} \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index 3b01ef3..c3e7f44 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,7 +52,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -160,7 +162,9 @@ public: void DataAbort(); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -283,7 +287,9 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fe23859..18cb27e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,7 +4,10 @@ #include +#include "../dolphin/CommonFuncs.h" + #ifdef _WIN32 +#include <windows.h> #else #include <sys/mman.h> #include <unistd.h> @@ -32,8 +35,6 @@ const int RegisterCache::NativeRegsAvailable = #endif ; -int instructionPopularityARM[ARMInstrInfo::ak_Count]; - /* We'll repurpose this .bss memory @@ -42,29 +43,33 @@ u8 CodeMemory[1024 * 1024 * 32]; Compiler::Compiler() { -#ifdef _WIN32 -#else - u64 pagesize = sysconf(_SC_PAGE_SIZE); -#endif - - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; - -#ifdef _WIN32 -#else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); -#endif - - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + } ClearCodeSpace(); - SetCodePtr(pageAligned); - - memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); - for
(int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -118,7 +123,7 @@ Compiler::Compiler() SetJumpTarget(und); MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); RET(); - } + } { // RSCRATCH mode // ABI_PARAM2 reg n @@ -163,7 +168,10 @@ Compiler::Compiler() RET(); } - ResetStart = (void*)GetWritableCodePtr(); + // move the region forward to prevent overwriting the generated functions + region_size -= GetWritableCodePtr() - region; + total_region_size = region_size; + region = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - SetCodePtr((u8*)ResetStart); + ClearCodeSpace(); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) @@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 75fa42c..bfc0ad9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,19 +49,22 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index 3e1c08b..5b5f935 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -813,7 +813,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -835,7 +837,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -857,8 +861,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -880,8 +886,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + 
ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 5c0892a..33bab75 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,8 +37,10 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -51,8 +53,10 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, +#ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 9dda157..9296335 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,8 +51,10 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +#ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +#endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index cb85d13..7636a07 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -169,7 +169,9 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED ARMJIT::Init(); +#endif DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); @@ -203,7 +205,9 @@ void DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED ARMJIT::DeInit(); +#endif for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -566,7 +570,9 @@ void Reset() KeyCnt = 0; RCnt = 0; +#ifdef JIT_ENABLED ARMJIT::InvalidateBlockCache(); +#endif NDSCart::Reset(); GBACart::Reset(); @@ -794,10 +800,12 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED if (!file->Saving) { ARMJIT::InvalidateBlockCache(); } +#endif return true; } @@ -923,9 +931,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM9->ExecuteJIT(); else +#endif ARM9->Execute(); } @@ -949,9 +959,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM7->ExecuteJIT(); else +#endif ARM7->Execute(); } @@ -984,9 +996,11 @@ u32 RunFrame() u32 RunFrame() { +#ifdef JIT_ENABLED if (Config::JIT_Enable) return RunFrame(); else +#endif return RunFrame(); } @@ -1998,7 +2012,9 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2050,7 +2066,9 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2118,7 +2136,9 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2414,7 +2434,9 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2475,7 +2497,9 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2546,7 +2570,9 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(1, addr); +#endif switch (addr & 0xFF800000) { diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h index 31a8d93..e71cf6d 100644 --- a/src/dolphin/CodeBlock.h +++ b/src/dolphin/CodeBlock.h @@ -9,7 +9,6 @@ #include 
"Assert.h" #include "../types.h" -#include "MemoryUtil.h" namespace Common { @@ -41,8 +40,6 @@ public: CodeBlock() = default; virtual ~CodeBlock() { - if (region) - FreeCodeSpace(); } CodeBlock(const CodeBlock&) = delete; CodeBlock& operator=(const CodeBlock&) = delete; -- cgit v1.2.3 From dd04cef47ea3e006788e8d10f5e79035e11ca139 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jul 2019 19:17:10 +0200 Subject: jit: fix BLX_reg with rn=lr --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 3 ++- src/ARM_InstrInfo.cpp | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 05c8ec6..1f95a90 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -187,9 +187,10 @@ void Compiler::A_Comp_BranchImm() void Compiler::A_Comp_BranchXchangeReg() { OpArg rn = MapReg(CurInstr.A_Reg(0)); + MOV(32, R(RSCRATCH), rn); if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg MOV(32, MapReg(14), Imm32(R15 - 4)); - Comp_JumpTo(rn.GetSimpleReg()); + Comp_JumpTo(RSCRATCH); } void Compiler::T_Comp_BCOND() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b8dff00..c36d6c1 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -359,10 +359,7 @@ Info Decode(bool thumb, u32 num, u32 instr) } if (data & A_Link) - { res.DstRegs |= 1 << 14; - res.SrcRegs |= 1 << 15; - } if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right -- cgit v1.2.3 From 3167ddcde1b99823e2b2a23f6dd611f3d67f1293 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jul 2019 20:34:08 +0200 Subject: jit: LDM/STM keep proper stack alignment --- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 15a40f8..ee0a7af 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -480,11 +480,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + // we need to make sure that the stack stays aligned to 16 bytes + u32 stackAlloc = ((regsCount + 1) & ~1) * 8; + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -508,7 +511,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc POP(ABI_PARAM3); CALL(WriteBanked); FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); SaveReg(reg, ABI_PARAM3); SetJumpTarget(sucessfulWritten); @@ -529,6 +532,9 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + if (regsCount & 1) + POP(RSCRATCH); + if (regs[15]) { if (Num == 1) @@ -543,6 +549,9 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + if (regsCount & 1) + PUSH(RSCRATCH); + bool firstUserMode = true; for (int reg : regs) { @@ -572,6 +581,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PUSH(MapReg(reg).GetSimpleReg()); } } + MOV(64, R(ABI_PARAM2), R(RSP)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); @@ -579,7 +589,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); } return offset; -- cgit v1.2.3 From 03b321f540f0f546408a85eb0437e66d21befb75 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 17 Jul 2019 03:18:37 +0200 Subject: jit: fix misc static branch things --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 27 +++++++++++++++++++++++---- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 15 ++++++++++----- src/ARM_InstrInfo.cpp | 11 ++++------- 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 1f95a90..6ae4aad 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -35,6 +35,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 newregion = addr >> 24; u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); @@ -53,7 +54,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (addr & 0x2) { nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; - cycles += CurCPU->CodeCycles; + cycles += cpu9->CodeCycles; nextInstr[1] = cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } @@ -61,7 +62,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { nextInstr[0] = cpu9->CodeRead32(addr, true); nextInstr[1] = nextInstr[0] >> 16; - cycles += CurCPU->CodeCycles; + cycles += cpu9->CodeCycles; } } else @@ -74,6 +75,10 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) nextInstr[1] = cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); } else { @@ -86,26 +91,40 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = codeCycles; MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); if (addr & 0x1) { addr 
&= ~0x1; newPC = addr+2; + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; } else { addr &= ~0x3; newPC = addr+4; + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + nextInstr[0] = cpu7->CodeRead32(addr); nextInstr[1] = cpu7->CodeRead32(addr+4); cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; } MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); @@ -204,7 +223,7 @@ void Compiler::T_Comp_BCOND() FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 18cb27e..1e871fd 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -354,8 +354,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) InvalidateBlockCache(); - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; @@ -363,6 +361,13 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CodeRegion = cpu->CodeRegion; CurCPU = cpu; + CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); + + if (!IsMapped(Num, R15 - Thumb ? 2 : 4)) + { + printf("Trying to compile a block in unmapped memory\n"); + } + bool mergedThumbBL = false; ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -383,7 +388,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (comp == NULL || i == instrsCount - 1) + bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); @@ -454,10 +460,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else (this->*comp)(); - FixupBranch skipFailed; if (CurInstr.Cond() < 0xE) { - skipFailed = J(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index c36d6c1..5db2471 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -178,7 +178,6 @@ enum { T_ReadR13 = 1 << 9, T_WriteR13 = 1 << 10, - T_ReadR15 = 1 << 11, T_BranchAlways = 1 << 12, T_ReadR14 = 1 << 13, @@ -222,7 +221,7 @@ const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); -const u32 T_ADD_PCREL = T_Write8 | T_ReadR15 | tk(tk_ADD_PCREL); +const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); @@ -257,11 +256,11 @@ const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); const u32 T_B = T_BranchAlways | tk(tk_B); -const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); -const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); +const u32 T_BL_LONG_1 = T_WriteR14 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | tk(tk_BL_LONG_2); const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); -const u32 T_SVC = T_BranchAlways | T_WriteR14 | T_ReadR15 | tk(tk_SVC); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); #define INSTRFUNC_PROTO(x) u32 x #include "ARM_InstrTable.h" @@ -299,8 +298,6 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= (1 << 13); if (data & T_WriteR13) res.DstRegs |= (1 << 13); - if (data & T_ReadR15) - res.SrcRegs |= (1 << 15); if (data & T_WriteR14) res.DstRegs |= (1 << 14); if (data & T_ReadR14) -- cgit v1.2.3 From 4deecc7d65e61c13d214b46c105dcfb381aacc54 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 13:36:48 +0200 Subject: jit: decrease blockcache AddrMapping size for ARM9 --- src/ARM.cpp | 8 ++--- src/ARMJIT.cpp | 18 ++++++---- src/ARMJIT.h | 67 ++++++++++++++++++++++++++++---------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++- src/NDS.cpp | 12 +++---- 5 files changed, 74 insertions(+), 35 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index bfe1890..dd0be6a 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -609,14 +609,14 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped(0, instrAddr)) + if (!ARMJIT::IsMapped<0>(instrAddr)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); if (Halted) @@ -740,13 +740,13 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped(1, instrAddr)) + if (!ARMJIT::IsMapped<1>(instrAddr)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); Cycles += (block ? block : ARMJIT::CompileBlock(this))(); // TODO optimize this shit!!! diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index e8e6be0..aad14c0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -109,11 +109,14 @@ void Init() { memset(&cache, 0, sizeof(BlockCache)); - for (int cpu = 0; cpu < 2; cpu++) - for (int i = 0; i < 0x4000; i++) - cache.AddrMapping[cpu][i] = JIT_MEM[cpu][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[cpu][i >> 9]) - + (((i << 14) & JIT_MASK[cpu][i >> 9]) >> 1); + for (int i = 0; i < 0x2000; i++) + cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : + (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) + + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + for (int i = 0; i < 0x4000; i++) + cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : + (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) + + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); compiler = new Compiler(); } @@ -175,7 +178,10 @@ CompiledBlock CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, blockAddr, block); + if (cpu->Num == 0) + InsertBlock<0>(blockAddr, block); + else + InsertBlock<1>(blockAddr, block); return block; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 004256c..0fc1c38 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -47,9 +47,11 @@ struct FetchedInstr a function which executes a block instructions starting from there. The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x4000 16 KB blocks, each of which a pointer to the relevant - place inside the before mentioned arrays. Only half of the bytes need to be - addressed (ARM address are aligned to 4, Thumb addresses to a 2 byte boundary). + divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which + a pointer to the relevant place inside the aforementioned arrays. 32 and 16 KB + are the sizes of the smallest contiguous memory region mapped to the respective CPU. + Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, + we only need every second half word to be addressable. In case a memory write hits mapped memory, the function block at this address is set to null, so it's recompiled the next time it's executed.
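Spelled out, the scheme this comment describes makes a block lookup two array indexings and invalidation a single pointer store. A simplified single-CPU sketch of the idea (types reduced to the bare minimum, not the exact melonDS declarations, which follow in the hunk below):

#include <cstdint>

typedef uint32_t (*CompiledBlock)();

// first level: one entry per 16 KB page of the 28 bit address space
static CompiledBlock* AddrMapping[0x4000];

static CompiledBlock LookUp(uint32_t addr)
{
    CompiledBlock* page = AddrMapping[(addr & 0xFFFFFFF) >> 14];
    if (!page)
        return nullptr;                // code in an unmapped region
    return page[(addr & 0x3FFF) >> 1]; // halfword granularity within the page
}

static void Invalidate(uint32_t addr)
{
    CompiledBlock* page = AddrMapping[(addr & 0xFFFFFFF) >> 14];
    if (page)
        page[(addr & 0x3FFF) >> 1] = nullptr; // block is recompiled on next execution
}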
@@ -61,7 +63,8 @@ struct FetchedInstr struct BlockCache { - CompiledBlock* AddrMapping[2][0x4000] = {0}; + CompiledBlock* AddrMapping9[0x2000] = {0}; + CompiledBlock* AddrMapping7[0x4000] = {0}; CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM @@ -75,35 +78,63 @@ struct BlockCache extern BlockCache cache; -inline bool IsMapped(u32 num, u32 addr) +template <u32 num> +inline bool IsMapped(u32 addr) { - return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; + if (num == 0) + return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + else + return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; } -inline CompiledBlock LookUpBlock(u32 num, u32 addr) +template <u32 num> +inline CompiledBlock LookUpBlock(u32 addr) { - return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + if (num == 0) + return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + else + return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; } -inline void Invalidate16(u32 num, u32 addr) +template <u32 num> +inline void Invalidate16(u32 addr) { - if (IsMapped(num, addr)) - cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; + if (IsMapped<num>(addr)) + { + if (num == 0) + cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; + else + cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; + } } -inline void Invalidate32(u32 num, u32 addr) +template <u32 num> +inline void Invalidate32(u32 addr) { - if (IsMapped(num, addr)) + if (IsMapped<num>(addr)) { - CompiledBlock* page = cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; + if (num == 0) + { + CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + page[(addr & 0x7FFF) >> 1] = NULL; + page[((addr + 2) & 0x7FFF) >> 1] = NULL; + } + else + { + CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + page[(addr & 0x3FFF) >> 1] = NULL; + page[((addr + 2) & 0x3FFF) >> 1] = NULL; + } } } -inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) +template <u32 num> +inline void InsertBlock(u32 addr, CompiledBlock func) { - cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + if (num == 0) + cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; + else + cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; } void Init(); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 1e871fd..cb11f73 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -363,7 +363,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - if (!IsMapped(Num, R15 - Thumb ? 2 : 4)) + if (!(Num == 0 ? IsMapped<0>(R15 - (Thumb ? 2 : 4)) : IsMapped<1>(R15 - (Thumb ?
2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } diff --git a/src/NDS.cpp b/src/NDS.cpp index 7636a07..3de9c1f 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -2013,7 +2013,7 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(0, addr); + ARMJIT::Invalidate16<0>(addr); #endif switch (addr & 0xFF000000) @@ -2067,7 +2067,7 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(0, addr); + ARMJIT::Invalidate16<0>(addr); #endif switch (addr & 0xFF000000) @@ -2137,7 +2137,7 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32(0, addr); + ARMJIT::Invalidate32<0>(addr); #endif switch (addr & 0xFF000000) @@ -2435,7 +2435,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(1, addr); + ARMJIT::Invalidate16<1>(addr); #endif switch (addr & 0xFF800000) @@ -2498,7 +2498,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(1, addr); + ARMJIT::Invalidate16<1>(addr); #endif switch (addr & 0xFF800000) @@ -2571,7 +2571,7 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32(1, addr); + ARMJIT::Invalidate32<1>(addr); #endif switch (addr & 0xFF800000) -- cgit v1.2.3 From d74b15eecc4c5d82703535e0d5c687c3cf225eae Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 17:28:16 +0200 Subject: jit: fix thumb hi reg alu and mcr halt + mcr/mrc aren't always, msr_imm is never unk on ARM7 --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 21 ++++++++++++++------- src/ARM_InstrInfo.cpp | 33 ++++++++++++++++++++++++++++----- src/ARM_InstrInfo.h | 1 + 5 files changed, 45 insertions(+), 16 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index aad14c0..6948eee 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -174,7 +174,7 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 013f54c..bdf06f7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); @@ -671,8 +671,6 @@ void Compiler::T_Comp_ALU_HiReg() case 0x2: // MOV if (rdMapped != rs) MOV(32, rdMapped, rs); - TEST(32, rdMapped, rdMapped); - Comp_RetriveFlags(false, false, false); break; } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 6ae4aad..9d4c1e2 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -235,16 +235,23 @@ void Compiler::T_Comp_B() void Compiler::T_Comp_BranchXchangeReg() { bool link = CurInstr.Instr & (1 << 7); - if (link && Num == 1) - { - printf("BLX unsupported on ARM7!!!\n"); - return; - } - OpArg rn = MapReg(CurInstr.A_Reg(3)); if 
(link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(32, R(RSCRATCH), MapReg(CurInstr.A_Reg(3))); MOV(32, MapReg(14), Imm32(R15 - 1)); - Comp_JumpTo(rn.GetSimpleReg()); + Comp_JumpTo(RSCRATCH); + } + else + { + OpArg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn.GetSimpleReg()); + } } void Compiler::T_Comp_BL_LONG_1() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 5db2471..b70c8dc 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -152,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); +const u32 A_MSR_IMM = ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -310,6 +310,7 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= 1 << 15; res.Kind = (data >> 16) & 0x3F; + res.EndBlock = res.Branches(); return res; } @@ -324,6 +325,26 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_MCR) + { + u32 cn = (instr >> 16) & 0xF; + u32 cm = instr & 0xF; + u32 cpinfo = (instr >> 5) & 0x7; + u32 id = (cn<<8)|(cm<<4)|cpinfo; + if (id == 0x704 || id == 0x782) + res.EndBlock |= true; + } + if (res.Kind == ak_MCR || res.Kind == ak_MRC) + { + u32 cp = ((instr >> 8) & 0xF); + if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) + { + printf("happens\n"); + data = A_UNK; + res.Kind = ak_UNK; + } + } + if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); if (data & A_Read16) @@ -361,6 +382,8 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + res.EndBlock |= res.Branches(); + return res; } } diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 51dcfa2..4fe9b10 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -220,6 +220,7 @@ struct Info u16 DstRegs, SrcRegs; u16 Kind; + bool EndBlock; bool Branches() { return DstRegs & (1 << 15); -- cgit v1.2.3 From 00cd9af033cfaffce262dc972170dbec177af3e1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 23:56:24 +0200 Subject: fix uninitialised memory mapping --- src/ARM.cpp | 1 - src/CP15.cpp | 2 ++ src/NDS.cpp | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index dd0be6a..50ef8fd 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -124,7 +124,6 @@ void ARMv5::Reset() GetMemRegion = NDS::ARM9GetMemRegion; } - CP15Reset(); ARM::Reset(); } diff --git a/src/CP15.cpp b/src/CP15.cpp index 5b5f935..77244f2 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -258,9 +258,11 @@ void ARMv5::UpdatePURegions(bool update_all) void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { + printf("initialising region timings %x %x\n", addrstart, addrend); addrstart >>= 12; addrend >>= 12; + if (addrend == 0xFFFFF) addrend++; for (u32 i = addrstart; i < addrend; i++) diff --git a/src/NDS.cpp b/src/NDS.cpp index 3de9c1f..0bde139 100644 --- a/src/NDS.cpp +++ 
b/src/NDS.cpp @@ -501,6 +501,10 @@ void Reset() ARM9ClockShift = 1; MainRAMMask = 0x3FFFFF; } + // has to be called before InitTimings + // otherwise some PU settings are completely + // unitialised on the first run + ARM9->CP15Reset(); ARM9Timestamp = 0; ARM9Target = 0; ARM7Timestamp = 0; ARM7Target = 0; -- cgit v1.2.3 From 0d786573aba04573bb5909b98c4af7a38a750224 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 23:59:02 +0200 Subject: remove debug printf --- src/CP15.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/CP15.cpp b/src/CP15.cpp index 77244f2..5b5f935 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -258,11 +258,9 @@ void ARMv5::UpdatePURegions(bool update_all) void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { - printf("initialising region timings %x %x\n", addrstart, addrend); addrstart >>= 12; addrend >>= 12; - if (addrend == 0xFFFFF) addrend++; for (u32 i = addrstart; i < addrend; i++) -- cgit v1.2.3 From 851930f5e0605f8e6fce1c6b1ce110dfe5d3decd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 22 Jul 2019 01:04:42 +0200 Subject: jit: fix RSC --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index bdf06f7..368fd8b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -181,7 +181,7 @@ void Compiler::A_Comp_Arith() Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); -- cgit v1.2.3 From 86b96ca47a3b08a16ed5ed865b2d1bdb46a6c8cb Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 16 Aug 2019 23:17:08 +0200 Subject: remove unneeded dolphin code, C++11 static_assert --- src/ARMJIT.cpp | 2 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 19 ++++---- src/ARMJIT_x64/ARMJIT_Compiler.h | 5 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 2 +- src/dolphin/Assert.h | 47 ------------------- src/dolphin/CodeBlock.h | 91 ------------------------------------- src/dolphin/Compat.h | 63 +++++++++++++++++++++++++ src/dolphin/Intrinsics.h | 72 ----------------------------- src/dolphin/Log.h | 21 --------- src/dolphin/x64CPUDetect.cpp | 1 - src/dolphin/x64Emitter.cpp | 3 +- src/dolphin/x64Emitter.h | 13 +----- 13 files changed, 84 insertions(+), 259 deletions(-) delete mode 100644 src/dolphin/Assert.h delete mode 100644 src/dolphin/CodeBlock.h create mode 100644 src/dolphin/Compat.h delete mode 100644 src/dolphin/Intrinsics.h delete mode 100644 src/dolphin/Log.h (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 6948eee..74554d7 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -188,6 +188,8 @@ CompiledBlock CompileBlock(ARM* cpu) void InvalidateBlockCache() { + printf("Resetting JIT block cache...\n"); + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 368fd8b..f0bcf8e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -257,7 
+257,7 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O Comp_AddCycles_CI(RSCRATCH, add ? 2 : 1); } - static_assert(EAX == RSCRATCH); + static_assert(EAX == RSCRATCH, "Someone changed RSCRATCH!"); MOV(32, R(RSCRATCH), rm); if (add) { @@ -383,7 +383,7 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b } MOV(32, R(RSCRATCH), rm); - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "Someone changed RSCRATCH3"); MOV(32, R(ECX), rs); AND(32, R(ECX), Imm32(0xFF)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index cb11f73..0fbcfda 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -63,12 +63,11 @@ Compiler::Compiler() mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); #endif - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + ResetStart = pageAligned; + CodeMemSize = alignedSize; } - ClearCodeSpace(); + Reset(); for (int i = 0; i < 3; i++) { @@ -169,9 +168,8 @@ Compiler::Compiler() } // move the region forward to prevent overwriting the generated functions - region_size -= GetWritableCodePtr() - region; - total_region_size = region_size; - region = GetWritableCodePtr(); + CodeMemSize -= GetWritableCodePtr() - ResetStart; + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -208,7 +206,7 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) { if (cond >= 0x8) { - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); MOV(32, R(RSCRATCH3), R(RCPSR)); SHR(32, R(RSCRATCH3), Imm8(28)); MOV(32, R(RSCRATCH), Imm32(1)); @@ -346,12 +344,13 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - ClearCodeSpace(); + memset(ResetStart, 0xcc, CodeMemSize); + SetCodePtr(ResetStart); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { - if (IsAlmostFull()) + if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... InvalidateBlockCache(); ConstantCycles = 0; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 0ce7d8d..3151cbc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -17,7 +17,7 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler : public Gen::X64CodeBlock +class Compiler : public Gen::XEmitter { public: Compiler(); @@ -132,6 +132,9 @@ public: return Gen::R(RegCache.Mapping[reg]); } + u8* ResetStart; + u32 CodeMemSize; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index ee0a7af..6386f8b 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -171,7 +171,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } RET(); - static_assert(RSCRATCH == EAX); + static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); return res; } diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h deleted file mode 100644 index 4eb16e0..0000000 --- a/src/dolphin/Assert.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include - -#define ASSERT_MSG(_t_, _a_, _fmt_, ...) 
\ - assert(_a_) \ - /*do \ - { \ - if (!(_a_)) \ - { \ - if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ - assert(_a_); \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ - { \ - ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ - if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - ASSERT_MSG(MASTER_LOG, _a_, \ - _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ - __LINE__, __FILE__); \ - } while (0)*/ - -#define DEBUG_ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ - ASSERT(_a_); \ - } while (0)*/ diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h deleted file mode 100644 index e71cf6d..0000000 --- a/src/dolphin/CodeBlock.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2014 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include <cstddef> -#include <vector> - -#include "Assert.h" -#include "../types.h" - -namespace Common -{ -// Everything that needs to generate code should inherit from this. -// You get memory management for free, plus, you can use all emitter functions without -// having to prefix them with gen-> or something similar. -// Example implementation: -// class JIT : public CodeBlock<XEmitter> {} -template <typename T> -class CodeBlock : public T -{ -private: - // A privately used function to set the executable RAM space to something invalid. - // For debugging usefulness it should be used to set the RAM to a host specific breakpoint - // instruction - virtual void PoisonMemory() = 0; - -protected: - u8* region = nullptr; - // Size of region we can use. - size_t region_size = 0; - // Original size of the region we allocated. - size_t total_region_size = 0; - - bool m_is_child = false; - std::vector<CodeBlock*> m_children; - -public: - CodeBlock() = default; - virtual ~CodeBlock() - { - } - CodeBlock(const CodeBlock&) = delete; - CodeBlock& operator=(const CodeBlock&) = delete; - CodeBlock(CodeBlock&&) = delete; - CodeBlock& operator=(CodeBlock&&) = delete; - - // Always clear code space with breakpoints, so that if someone accidentally executes - // uninitialized, it just breaks into the debugger. - void ClearCodeSpace() - { - PoisonMemory(); - ResetCodePtr(); - } - - bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } - - void ResetCodePtr() { T::SetCodePtr(region); } - size_t GetSpaceLeft() const - { - ASSERT(static_cast<size_t>(T::GetCodePtr() - region) < region_size); - return region_size - (T::GetCodePtr() - region); - } - - bool IsAlmostFull() const - { - // This should be bigger than the biggest block ever.
- return GetSpaceLeft() < 0x10000; - } - - bool HasChildren() const { return region_size != total_region_size; } - u8* AllocChildCodeSpace(size_t child_size) - { - ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); - u8* child_region = region + region_size - child_size; - region_size -= child_size; - return child_region; - } - void AddChildCodeSpace(CodeBlock* child, size_t child_size) - { - u8* child_region = AllocChildCodeSpace(child_size); - child->m_is_child = true; - child->region = child_region; - child->region_size = child_size; - child->total_region_size = child_size; - child->ResetCodePtr(); - m_children.emplace_back(child); - } -}; -} // namespace Common diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h new file mode 100644 index 0000000..f2f52a5 --- /dev/null +++ b/src/dolphin/Compat.h @@ -0,0 +1,63 @@ +// Stubs for Assert.h and Log.h +#pragma once + +#include <assert.h> + +// Assert stub +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ + +// Log Stub +#include <stdio.h> + +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define DYNA_REC 0 + +#define ERROR_LOG(which, fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + } while (false) diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h deleted file mode 100644 index 483f219..0000000 --- a/src/dolphin/Intrinsics.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#if defined(_M_X86) - -/** - * It is assumed that all compilers used to build Dolphin support intrinsics up to and including - * SSE 4.2 on x86/x64. - */ - -#if defined(__GNUC__) || defined(__clang__) - -/** - * Due to limitations in GCC, SSE intrinsics are only available when compiling with the - * corresponding instruction set enabled. However, using the target attribute, we can compile - * single functions with a different target instruction set, while still creating a generic build. - * - * Since this instruction set is enabled per-function, any callers should verify that the - * instruction set is supported at runtime before calling it, and provide a fallback implementation - * when not supported. - * - * When building with -march=native, or enabling the instruction sets in the compile flags, permit - * usage of the instrinsics without any function attributes. If the command-line architecture does - * not support this instruction set, enable it via function targeting.
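For context while reading the emitter sources: the function-targeting idiom this deleted header wrapped lets one translation unit carry SSE4.2 code paths inside an otherwise generic build. A small sketch of it under GCC/Clang; the runtime support check is assumed to happen elsewhere (for example via the flags filled in by x64CPUDetect):

#include <cstddef>
#include <cstdint>
#include <x86intrin.h>

#ifndef __SSE4_2__
#define TARGET_SSE42 [[gnu::target("sse4.2")]]
#else
#define TARGET_SSE42
#endif

// Compiled with SSE4.2 enabled even if the surrounding build is generic;
// callers must confirm CPU support at runtime before taking this path.
TARGET_SSE42
static uint32_t Crc32U8(uint32_t crc, uint8_t b)
{
    return _mm_crc32_u8(crc, b);
}

uint32_t Checksum(const uint8_t* data, size_t len)
{
    uint32_t crc = ~0u;
    for (size_t i = 0; i < len; i++)
        crc = Crc32U8(crc, data[i]);
    return ~crc;
}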
- */ - -#include <x86intrin.h> -#ifndef __SSE4_2__ -#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]] -#endif -#ifndef __SSE4_1__ -#define FUNCTION_TARGET_SSR41 [[gnu::target("sse4.1")]] -#endif -#ifndef __SSSE3__ -#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]] -#endif -#ifndef __SSE3__ -#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]] -#endif - -#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) - -/** - * MSVC and ICC support intrinsics for any instruction set without any function attributes. - */ -#include <intrin.h> - -#endif // defined(_MSC_VER) || defined(__INTEL_COMPILER) - -#endif // _M_X86 - -/** - * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform. - * This way when a function is defined with FUNCTION_TARGET you don't need to define a second - * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use - * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures. - */ -#ifndef FUNCTION_TARGET_SSE42 -#define FUNCTION_TARGET_SSE42 -#endif -#ifndef FUNCTION_TARGET_SSR41 -#define FUNCTION_TARGET_SSR41 -#endif -#ifndef FUNCTION_TARGET_SSSE3 -#define FUNCTION_TARGET_SSSE3 -#endif -#ifndef FUNCTION_TARGET_SSE3 -#define FUNCTION_TARGET_SSE3 -#endif diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h deleted file mode 100644 index a7f4b6a..0000000 --- a/src/dolphin/Log.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include "CommonFuncs.h" - -#include <stdio.h> - -#define PanicAlert(fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - - -#define DYNA_REC 0 - -#define ERROR_LOG(which, fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - } while (false) diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp index 05ee11c..49b51c9 100644 --- a/src/dolphin/x64CPUDetect.cpp +++ b/src/dolphin/x64CPUDetect.cpp @@ -7,7 +7,6 @@ #include "CPUDetect.h" #include "../types.h" -#include "Intrinsics.h" #ifndef _MSVC_VER diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp index 7849624..343f314 100644 --- a/src/dolphin/x64Emitter.cpp +++ b/src/dolphin/x64Emitter.cpp @@ -7,9 +7,10 @@ #include "CPUDetect.h" #include "../types.h" -#include "Log.h" #include "x64Emitter.h" #include "x64Reg.h" +#include "Compat.h" +#include "CommonFuncs.h" namespace Gen { diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h index 122850d..869acb6 100644 --- a/src/dolphin/x64Emitter.h +++ b/src/dolphin/x64Emitter.h @@ -12,9 +12,8 @@ #include #include -#include "Assert.h" +#include "Compat.h" #include "BitSet.h" -#include "CodeBlock.h" #include "../types.h" #include "x64ABI.h" @@ -1167,14 +1166,4 @@ public: } }; // class XEmitter -class X64CodeBlock : public Common::CodeBlock<XEmitter> -{ -private: - void PoisonMemory() override - { - // x86/64: 0xCC = breakpoint - memset(region, 0xCC, region_size); - } -}; - } // namespace -- cgit v1.2.3 From fc82ca1a97ce8304bf563ca53187227e505eb54e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 17 Aug 2019 14:58:37 +0200 Subject: fix register alloc for half word loads fixes Mega Man Star Force 2 with a cheat applied; it probably used a PC-relative load which was interpreted as a branch --- src/ARM_InstrInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b70c8dc..4813799 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -127,8 +127,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_STRD A_Read12Double
A_Read12Double #define A_IMPLEMENT_HD_LDRSTR(x,k) \ - const u32 A_##x##_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_REG); \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG); \ const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG); -- cgit v1.2.3 From 316378092ac1791f4ada3b6b81b2681eab14d58d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 17 Aug 2019 16:50:48 +0200 Subject: abandon pipelining on jit fixes Golden Sun Dawn this makes the cpu state incompatible between interpreter and JIT. That's why switching cpu mode requires a restart(not requiring is stupid anyway) and the pipeline is manually filled when making a save state. --- src/ARM.cpp | 46 ++++++++++++++++++++++++++++++++++++- src/ARM.h | 6 +++++ src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_Branch.cpp | 39 ++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 5 ---- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 5 ---- 6 files changed, 69 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 50ef8fd..7caef75 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter.h" #include "AREngine.h" #include "ARMJIT.h" +#include "Config.h" // instruction timing notes @@ -168,6 +169,13 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); + if (!file->Saving && Config::JIT_Enable) + { + // hack, the JIT doesn't really pipeline + // but we still want JIT save states to be + // loaded while running the interpreter + FillPipeline(); + } file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); @@ -767,4 +775,40 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; } -#endif \ No newline at end of file +#endif + +void ARMv5::FillPipeline() +{ + if (CPSR & 0x20) + { + if ((R[15] - 2) & 0x2) + { + NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; + NextInstr[1] = CodeRead32(R[15], false); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[1] = NextInstr[0] >> 16; + } + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4, false); + NextInstr[1] = CodeRead32(R[15], false); + } +} + +void ARMv4::FillPipeline() +{ + if (CPSR & 0x20) + { + NextInstr[0] = CodeRead16(R[15] - 2); + NextInstr[1] = CodeRead16(R[15]); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + } +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index c3e7f44..811b2e0 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -42,6 +42,8 @@ public: virtual void DoSavestate(Savestate* file); + virtual void FillPipeline() = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; void RestoreCPSR(); @@ -156,6 +158,8 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void PrefetchAbort(); @@ -284,6 +288,8 @@ public: void Reset(); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74554d7..949bc1c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -139,6 +139,7 @@ CompiledBlock CompileBlock(ARM* cpu) int i = 0; u32 blockAddr = cpu->R[15] 
- (thumb ? 2 : 4);
 u32 r15 = cpu->R[15];
+ cpu->FillPipeline();
 u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]};
 do
 {
diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index 9d4c1e2..30b18d7 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -4,6 +4,14 @@ using namespace Gen;
 namespace ARMJIT
 {
+
+template <typename T>
+int squeezePointer(T* ptr)
+{
+ int truncated = (int)((u64)ptr);
+ assert((T*)((u64)truncated) == ptr);
+ return truncated;
+}
 void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
 {
@@ -12,9 +20,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
 // we'll see how it works out
 u32 newPC;
- u32 nextInstr[2];
 u32 cycles = 0;
- bool setupRegion = false;
 if (addr & 0x1 && !Thumb)
 {
@@ -40,7 +46,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
 MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles));
- setupRegion = newregion != oldregion;
+ bool setupRegion = newregion != oldregion;
 if (setupRegion)
 cpu9->SetupCodeMem(addr);
@@ -53,15 +59,14 @@
 // doesn't matter if we put garbage in the MSbs there
 if (addr & 0x2)
 {
- nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16;
+ cpu9->CodeRead32(addr-2, true);
 cycles += cpu9->CodeCycles;
- nextInstr[1] = cpu9->CodeRead32(addr+2, false);
+ cpu9->CodeRead32(addr+2, false);
 cycles += CurCPU->CodeCycles;
 }
 else
 {
- nextInstr[0] = cpu9->CodeRead32(addr, true);
- nextInstr[1] = nextInstr[0] >> 16;
+ cpu9->CodeRead32(addr, true);
 cycles += cpu9->CodeCycles;
 }
 }
@@ -70,12 +75,15 @@
 addr &= ~0x3;
 newPC = addr+4;
- nextInstr[0] = cpu9->CodeRead32(addr, true);
+ cpu9->CodeRead32(addr, true);
 cycles += cpu9->CodeCycles;
- nextInstr[1] = cpu9->CodeRead32(addr+4, false);
+ cpu9->CodeRead32(addr+4, false);
 cycles += cpu9->CodeCycles;
 }
+ MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem)));
+ MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask));
+
 cpu9->RegionCodeCycles = compileTimeCodeCycles;
 if (setupRegion)
 cpu9->SetupCodeMem(R15);
@@ -102,8 +110,6 @@
 u32 compileTimePC = CurCPU->R[15];
 CurCPU->R[15] = newPC;
- nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr);
- nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2);
 cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1];
 CurCPU->R[15] = compileTimePC;
@@ -116,8 +122,6 @@
 u32 compileTimePC = CurCPU->R[15];
 CurCPU->R[15] = newPC;
- nextInstr[0] = cpu7->CodeRead32(addr);
- nextInstr[1] = cpu7->CodeRead32(addr+4);
 cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3];
 CurCPU->R[15] = compileTimePC;
@@ -128,19 +132,10 @@
 }
 MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC));
- MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(nextInstr[0]));
- MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(nextInstr[1]));
 if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles)
 ConstantCycles += cycles;
 else
 ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
-
- if (setupRegion)
- {
- MOV(64, R(ABI_PARAM1), R(RCPU));
- MOV(32, R(ABI_PARAM2), Imm32(newPC));
- CALL((void*)&ARMv5::SetupCodeMem);
- }
 }
 void
Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 0fbcfda..ab13cb6 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -395,11 +395,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (i == instrsCount - 1) - { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); - } if (comp == NULL) SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 6386f8b..3b4cb7d 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -457,11 +457,6 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -void printStuff2(u32 a, u32 b) -{ - printf("b %x %x\n", a, b); -} - s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { int regsCount = regs.Count(); -- cgit v1.2.3 From f378458c104f1879f30610dfe4010e4772218787 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 12:28:48 +0200 Subject: optimise away unneeded flag sets - especially useful for thumb code and larger max block sizes - can still be improved upon --- src/ARMJIT.cpp | 24 ++++ src/ARMJIT.h | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 64 +++++++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 9 ++ src/ARMJIT_x64/ARMJIT_Compiler.h | 6 +- src/ARM_InstrInfo.cpp | 238 +++++++++++++++++++++++-------------- src/ARM_InstrInfo.h | 13 ++ 7 files changed, 246 insertions(+), 109 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 949bc1c..3b6bc2e 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -126,6 +126,24 @@ void DeInit() delete compiler; } +void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +{ + for (int j = start; j >= 0; j--) + { + u8 match = instrs[j].Info.WriteFlags & flags; + u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags; + if (matchMaybe) // writes flags maybe + instrs[j].SetFlags |= matchMaybe; + if (match) + { + instrs[j].SetFlags |= match; + flags &= ~match; + if (!flags) + return; + } + } +} + CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -175,8 +193,14 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; + + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) + floodFillSetFlags(instrs, i - 2, canCompile ? 
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + floodFillSetFlags(instrs, i - 1, 0xF); + CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); if (cpu->Num == 0) diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 0fc1c38..6197695 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,6 +28,7 @@ struct FetchedInstr return Instr >> 28; } + u8 SetFlags; u32 Instr; u32 NextInstr[2]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f0bcf8e..6a7d711 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -111,6 +111,8 @@ OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) } else { + S = S && (CurInstr.SetFlags & 0x2); + int op = (CurInstr.Instr >> 5) & 0x3; if (CurInstr.Instr & (1 << 4)) { @@ -215,7 +217,8 @@ void Compiler::A_Comp_MovOp() if (S) { - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -263,12 +266,14 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O { IMUL(32, RSCRATCH, rs); LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); - TEST(32, rd, rd); + if (S && FlagsNZRequired()) + TEST(32, rd, rd); } else { IMUL(32, RSCRATCH, rs); MOV(32, rd, R(RSCRATCH)); + if (S && FlagsNZRequired()) TEST(32, R(RSCRATCH), R(RSCRATCH)); } @@ -331,7 +336,7 @@ void Compiler::A_Comp_SMULL_SMLAL() else { IMUL(64, RSCRATCH2, R(RSCRATCH3)); - if (S) + if (S && FlagsNZRequired()) TEST(64, R(RSCRATCH2), R(RSCRATCH2)); } @@ -345,9 +350,20 @@ void Compiler::A_Comp_SMULL_SMLAL() void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { - CPSRDirty = true; + if (CurInstr.SetFlags == 0) + return; + if (retriveCV && !(CurInstr.SetFlags & 0x3)) + retriveCV = false; bool carryOnly = !retriveCV && carryUsed; + if (carryOnly && !(CurInstr.SetFlags & 0x2)) + { + carryUsed = false; + carryOnly = false; + } + + CPSRDirty = true; + if (retriveCV) { SETcc(CC_O, R(RSCRATCH)); @@ -355,19 +371,28 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - SETcc(CC_S, R(RSCRATCH)); - SETcc(CC_Z, R(RSCRATCH3)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); - int shiftAmount = 30; - if (retriveCV || carryUsed) + if (FlagsNZRequired()) { - LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); - shiftAmount = carryOnly ? 29 : 28; - } - SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); - AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); - OR(32, R(RCPSR), R(RSCRATCH)); + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); + AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 
3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH2)); + } } // always uses RSCRATCH, RSCRATCH2 only if S == true @@ -523,7 +548,8 @@ void Compiler::T_Comp_ShiftImm() if (shifted != rd) MOV(32, rd, shifted); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -557,7 +583,8 @@ void Compiler::T_Comp_ALU_Imm8() { case 0x0: MOV(32, rd, imm); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, false); return; case 0x1: @@ -607,7 +634,8 @@ void Compiler::T_Comp_ALU() int shiftOp = op == 0x7 ? 3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); - TEST(32, shifted, shifted); + if (FlagsNZRequired()) + TEST(32, shifted, shifted); MOV(32, rd, shifted); Comp_RetriveFlags(false, false, true); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ab13cb6..6abb2bb 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -342,6 +342,11 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); @@ -380,11 +385,15 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); + printf("block start %d\n", Thumb); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3151cbc..8861884 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,6 +29,8 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); + bool CanCompile(bool thumb, u16 kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); @@ -64,7 +66,6 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); - void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -121,6 +122,9 @@ public: void LoadCPSR(); void SaveCPSR(); + bool FlagsNZRequired() + { return CurInstr.SetFlags & 0xC; } + Gen::FixupBranch CheckCondition(u32 cond); Gen::OpArg MapReg(int reg) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 4813799..ea6d827 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 13) +#define ak(x) ((x) << 18) enum { A_Read0 = 1 << 0, @@ -26,69 +26,81 @@ enum { A_Link = 1 << 10, A_UnkOnARM7 = 1 << 11, + + A_SetNZ = 1 << 12, + A_SetCV = 1 << 13, + A_SetMaybeC = 1 << 14, + A_MulFlags = 1 << 15, + A_ReadC = 1 << 16, + A_RRXReadC = 1 << 17, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_IMPLEMENT_ALU_OP(x,k) \ - const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 
A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ +#define A_ARITH A_SetCV +#define A_LOGIC A_SetMaybeC +#define A_ARITH_IMM A_SetCV +#define A_LOGIC_IMM 0 + +#define A_IMPLEMENT_ALU_OP(x,k,a,c) \ + const u32 A_##x##_IMM = A_Write12 | c | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ - const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); - -A_IMPLEMENT_ALU_OP(AND,BIOP) -A_IMPLEMENT_ALU_OP(EOR,BIOP) -A_IMPLEMENT_ALU_OP(SUB,BIOP) -A_IMPLEMENT_ALU_OP(RSB,BIOP) -A_IMPLEMENT_ALU_OP(ADD,BIOP) -A_IMPLEMENT_ALU_OP(ADC,BIOP) -A_IMPLEMENT_ALU_OP(SBC,BIOP) -A_IMPLEMENT_ALU_OP(RSC,BIOP) -A_IMPLEMENT_ALU_OP(ORR,BIOP) -A_IMPLEMENT_ALU_OP(MOV,MONOOP) -A_IMPLEMENT_ALU_OP(BIC,BIOP) -A_IMPLEMENT_ALU_OP(MVN,MONOOP) + const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | 
A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(SUB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(RSB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADD,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(SBC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(RSC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(ORR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MOV,MONOOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(BIC,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; -#define A_IMPLEMENT_ALU_TEST(x) \ - const u32 A_##x##_IMM = A_Read16 | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); - -A_IMPLEMENT_ALU_TEST(TST) -A_IMPLEMENT_ALU_TEST(TEQ) -A_IMPLEMENT_ALU_TEST(CMP) -A_IMPLEMENT_ALU_TEST(CMN) - -const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); -const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); -const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); -const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); -const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); -const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +#define A_IMPLEMENT_ALU_TEST(x,a) \ + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST,LOGIC) +A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) +A_IMPLEMENT_ALU_TEST(CMP,ARITH) +A_IMPLEMENT_ALU_TEST(CMN,ARITH) + +const u32 A_MUL = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 
A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); @@ -161,7 +173,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 16) +#define tk(x) ((x) << 20) enum { T_Read0 = 1 << 0, @@ -183,42 +195,47 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15 + T_PopPC = 1 << 15, + + T_SetNZ = 1 << 16, + T_SetCV = 1 << 17, + T_SetMaybeC = 1 << 18, + T_ReadC = 1 << 19 }; -const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_Write0 | T_Read3 | tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM); - -const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); -const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); -const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_); -const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_); - -const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = T_Write8 | tk(tk_CMP_IMM); -const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM); -const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM); - -const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); -const u32 T_EOR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); -const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); -const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); -const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); -const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); -const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); -const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); -const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG); -const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG); -const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG); -const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG); -const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); -const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); -const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); -const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG); +const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); 
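The ReadFlags/WriteFlags annotations built from these tables are what the floodFillSetFlags pass added to ARMJIT.cpp in this commit consumes: walking a block backwards, it keeps only the NZCV bits some later instruction actually reads. A minimal standalone sketch of that idea follows; the Instr struct here is illustrative, only the nibble layout (low: definitely written, high: conditionally written) mirrors the patch.

#include <cstdint>
#include <vector>

struct Instr
{
    uint8_t ReadFlags;  // NZCV bits this instruction consumes
    uint8_t WriteFlags; // low nibble: always written, high nibble: maybe written
    uint8_t SetFlags;   // result: flags the JIT must really compute
};

// Propagate the demand `flags` backwards from the instruction after `start`:
// a definite writer satisfies the demand and ends the search for those bits,
// a "maybe" writer (conditionally executed) must keep setting them but
// cannot satisfy the demand, since its write might not happen.
void FloodFillSetFlags(std::vector<Instr>& instrs, int start, uint8_t flags)
{
    for (int j = start; j >= 0 && flags; j--)
    {
        uint8_t definite = instrs[j].WriteFlags & flags;
        uint8_t maybe = (instrs[j].WriteFlags >> 4) & flags;
        instrs[j].SetFlags |= definite | maybe;
        flags &= ~definite;
    }
}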
+const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_SetNZ | T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_SetNZ | T_Write0 | T_Read3 | tk(tk_MVN_REG); const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); -const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_CMP_HIREG = T_SetNZ | T_SetCV | T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); @@ -268,10 +285,20 @@ const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); Info Decode(bool thumb, u32 num, u32 instr) { + const u8 FlagsReadPerCond[7] = { + flag_Z, + flag_C, + flag_N, + flag_V, + flag_C | flag_Z, + flag_N | flag_V, + flag_Z | flag_N | flag_V}; + Info res = {0}; if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + res.Kind = (data >> 20) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -309,7 +336,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_PopPC && instr & (1 << 8)) res.DstRegs |= 1 << 15; - res.Kind = (data >> 16) & 0x3F; + if (data & T_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & T_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & T_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if (data & T_ReadC) + res.ReadFlags |= flag_C; + + if (res.Kind == tk_BCOND) + res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; + res.EndBlock = res.Branches(); return res; @@ -323,7 +361,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 13) & 0x1FF; + res.Kind = (data >> 18) & 0x1FF; if (res.Kind == ak_MCR) { @@ -382,6 +420,26 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (data & A_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & A_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if ((data & A_MulFlags) && (instr & (1 << 20))) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_ReadC) + res.ReadFlags |= flag_C; + if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) + res.ReadFlags |= flag_C; + + if ((instr >> 28) < 0xE) + { + // make non conditional 
flag sets conditional + res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.ReadFlags |= FlagsReadPerCond[instr >> 29]; + } + res.EndBlock |= res.Branches(); return res; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 4fe9b10..5336837 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -215,11 +215,24 @@ enum tk_Count }; +enum +{ + flag_N = 1 << 3, + flag_Z = 1 << 2, + flag_C = 1 << 1, + flag_V = 1 << 0, +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 ReadFlags; + // lower 4 bits - set always + // upper 4 bits - might set flag + u8 WriteFlags; + bool EndBlock; bool Branches() { -- cgit v1.2.3 From d208f5909c97c3caeaa4c1c95a37e618824ec199 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:06:27 +0200 Subject: fixes for flag optimisation --- src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- src/ARM_InstrInfo.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 3b6bc2e..5d92e47 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -163,6 +163,7 @@ CompiledBlock CompileBlock(ARM* cpu) { r15 += thumb ? 2 : 4; + instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6a7d711..f868ddf 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -387,7 +387,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); OR(32, R(RCPSR), R(RSCRATCH)); } - else + else if (carryUsed || retriveCV) { SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index ea6d827..3634c35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -436,7 +436,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional - res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.WriteFlags = (res.WriteFlags | (res.WriteFlags << 4)) & 0xF0; res.ReadFlags |= FlagsReadPerCond[instr >> 29]; } -- cgit v1.2.3 From d57ee718ba056f7b5ec1bcebc6c6c79f1fe26993 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:09:03 +0200 Subject: remove debug printing --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ---- 1 file changed, 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6abb2bb..5e05446 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -385,15 +385,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); - printf("block start %d\n", Thumb); - for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; - printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); - CompileFunc comp = Thumb ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; -- cgit v1.2.3 From 85680d6fe56a701d85f7312766764615fab2f012 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:09:00 +0200 Subject: more fixes for flag optimisation + small cycle counting optimisation --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 4 ++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 28 ++++++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 4 ++ src/ARM_InstrInfo.cpp | 92 ++++++++++++++++++++++--------------- 5 files changed, 86 insertions(+), 44 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 30b18d7..c0a8f1f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -19,6 +19,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // it's not completely safe to assume stuff like, which instructions to preload // we'll see how it works out + IrregularCycles = true; + u32 newPC; u32 cycles = 0; @@ -140,6 +142,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { + IrregularCycles = true; + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5e05446..d585f39 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -447,6 +447,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Comp_AddCycles_C(); else { + IrregularCycles = false; + FixupBranch skipExecute; if (cond < 0xE) skipExecute = CheckCondition(cond); @@ -463,13 +465,19 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (CurInstr.Cond() < 0xE) { - FixupBranch skipFailed = J(); - SetJumpTarget(skipExecute); + if (IrregularCycles) + { + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); - Comp_AddCycles_C(); + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); + } + else + SetJumpTarget(skipExecute); } + } } @@ -518,8 +526,16 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles); - LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + if (!Thumb && CurInstr.Cond() < 0xE) + { + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + } + else + { + ConstantCycles += i + cycles; + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + } } } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 8861884..a62f043 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -139,6 +139,8 @@ public: u8* ResetStart; u32 CodeMemSize; + bool IrregularCycles; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 3b4cb7d..bf8280d 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -438,6 +438,8 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) { + IrregularCycles = true; + if (store) MOV(32, R(ABI_PARAM2), rd); u32 cycles = Num @@ -459,6 +461,8 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { + IrregularCycles = true; + int regsCount = regs.Count(); if (decrement) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 3634c35..9239e29 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 18) +#define ak(x) ((x) << 21) enum { A_Read0 = 1 << 0, @@ -33,13 +33,21 @@ enum { A_MulFlags = 1 << 15, A_ReadC = 1 << 16, A_RRXReadC = 1 << 17, + A_StaticShiftSetC = 1 << 18, + A_SetC = 1 << 19, + + A_WriteMemory = 1 << 20, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_ARITH A_SetCV -#define A_LOGIC A_SetMaybeC +#define A_ARITH_LSL_IMM A_SetCV +#define A_LOGIC_LSL_IMM A_StaticShiftSetC +#define A_ARITH_SHIFT_IMM A_SetCV +#define A_LOGIC_SHIFT_IMM A_SetC +#define A_ARITH_SHIFT_REG A_SetCV +#define A_LOGIC_SHIFT_REG A_SetMaybeC #define A_ARITH_IMM A_SetCV #define A_LOGIC_IMM 0 @@ -55,14 +63,14 @@ enum { const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + const u32 
A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a##_LSL_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) @@ -80,15 +88,15 @@ A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; #define A_IMPLEMENT_ALU_TEST(x,a) \ - const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a##_IMM | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a##_LSL_IMM | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); A_IMPLEMENT_ALU_TEST(TST,LOGIC) A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) @@ -115,20 +123,20 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 
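The IrregularCycles flag introduced earlier in this commit is what makes the cheaper conditional epilogue possible. Condensed from the Compiler.cpp hunk above (the wrapper function name here is hypothetical; in the patch this logic lives inline in Compiler::CompileBlock), the emitted shape per conditional ARM instruction is:

void Compiler::Comp_Conditional(u32 cond, CompileFunc comp)
{
    FixupBranch skipExecute = CheckCondition(cond); // branch taken if cond fails

    (this->*comp)(); // emit the instruction body

    if (IrregularCycles)
    {
        // data-dependent timing: the failed path needs its own cycle accounting
        FixupBranch skipFailed = J();
        SetJumpTarget(skipExecute);
        Comp_AddCycles_C(true);
        SetJumpTarget(skipFailed);
    }
    else
    {
        // fixed timing: success and failure cost the same constant,
        // so no extra branch or runtime cycle add is needed
        SetJumpTarget(skipExecute);
    }
}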
-#define A_STR A_Read12 +#define A_STR A_Read12 | A_WriteMemory #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSR); \ const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ - const u32 A_##x##_REG_ROR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + const u32 A_##x##_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ \ const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ - const u32 A_##x##_POST_REG_ROR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + const u32 A_##x##_POST_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); A_IMPLEMENT_WB_LDRSTR(STR,STR) A_IMPLEMENT_WB_LDRSTR(STRB,STR) @@ -136,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double +#define A_STRD A_Read12Double | A_WriteMemory #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -151,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -173,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 20) +#define tk(x) ((x) << 21) enum { T_Read0 = 1 << 0, @@ -200,12 +208,13 @@ enum { T_SetNZ = 1 << 16, T_SetCV = 1 << 17, T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19 + T_ReadC = 1 << 19, + T_SetC = 1 << 20, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); @@ -213,7 +222,7 @@ const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = 
T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Read8 | tk(tk_CMP_IMM); const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); @@ -240,7 +249,7 @@ const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); -const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); +const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); @@ -298,7 +307,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 20) & 0x3F; + res.Kind = (data >> 21) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -344,12 +353,14 @@ Info Decode(bool thumb, u32 num, u32 instr) res.WriteFlags |= flag_C << 4; if (data & T_ReadC) res.ReadFlags |= flag_C; + if (data & T_SetC) + res.WriteFlags |= flag_C; + + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; - res.EndBlock = res.Branches(); - return res; } else @@ -361,7 +372,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 18) & 0x1FF; + res.Kind = (data >> 21) & 0x1FF; if (res.Kind == ak_MCR) { @@ -369,7 +380,7 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 cm = instr & 0xF; u32 cpinfo = (instr >> 5) & 0x7; u32 id = (cn<<8)|(cm<<4)|cpinfo; - if (id == 0x704 || id == 0x782) + if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) @@ -420,6 +431,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (res.Kind == ak_STM) + res.SrcRegs |= instr & (1 << 15); + if (data & A_SetNZ) res.WriteFlags |= flag_N | flag_Z; if (data & A_SetCV) @@ -432,6 +446,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.ReadFlags |= flag_C; if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) res.ReadFlags |= flag_C; + if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) + res.WriteFlags |= flag_C; if ((instr >> 28) < 0xE) { -- cgit v1.2.3 From 0e26aa4edeafa0dab57d6e5a1b77e1a80c6ae3c4 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:48:20 +0200 Subject: load register only if needed - do thumb bl long merge in the first step - preparations for better branch jitting --- src/ARMJIT.cpp | 16 ++++++++++++++++ src/ARMJIT.h | 1 + src/ARMJIT_RegisterCache.h | 12 ++++++++---- src/ARMJIT_x64/ARMJIT_Branch.cpp | 12 +++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 34 ++++++++++++---------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARM_InstrInfo.h | 3 +++ 7 files changed, 48 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 5d92e47..85cadf3 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -159,6 +159,7 @@ CompiledBlock CompileBlock(ARM* cpu) u32 r15 = cpu->R[15]; cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstrAddr[2] = {blockAddr, r15}; do { r15 += thumb ? 
2 : 4; @@ -166,6 +167,10 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + + instrs[i].Addr = nextInstrAddr[0]; + nextInstrAddr[0] = nextInstrAddr[1]; + nextInstrAddr[1] = r15; if (cpu->Num == 0) { @@ -193,8 +198,19 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 + && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) + { + instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG; + instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16); + instrs[i - 1].Info.DstRegs = 0xC000; + instrs[i - 1].Info.SrcRegs = 0; + instrs[i - 1].Info.EndBlock = true; + i--; + } i++; + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 6197695..7e448ef 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -31,6 +31,7 @@ struct FetchedInstr u8 SetFlags; u32 Instr; u32 NextInstr[2]; + u32 Addr; u8 CodeCycles; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 04c1eda..fe2f203 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -38,7 +38,7 @@ public: Mapping[reg] = (Reg)-1; } - void LoadRegister(int reg) + void LoadRegister(int reg, bool loadValue) { assert(Mapping[reg] == -1); for (int i = 0; i < NativeRegsAvailable; i++) @@ -50,7 +50,8 @@ public: NativeRegsUsed |= 1 << (int)nativeReg; LoadedRegs |= 1 << reg; - Compiler->LoadReg(reg, nativeReg); + if (loadValue) + Compiler->LoadReg(reg, nativeReg); return; } @@ -66,7 +67,7 @@ public: UnloadRegister(reg); } - void Prepare(int i) + void Prepare(bool thumb, int i) { u16 futureNeeded = 0; int ranking[16]; @@ -111,8 +112,11 @@ public: loadedSet.m_val = LoadedRegs; } + BitSet16 needValueLoaded(needToBeLoaded); + if (thumb || Instr.Cond() >= 0xE) + needValueLoaded = BitSet16(Instr.Info.SrcRegs); for (int reg : needToBeLoaded) - LoadRegister(reg); + LoadRegister(reg, needValueLoaded[reg]); } DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c0a8f1f..cc7a3c4 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -271,15 +271,17 @@ void Compiler::T_Comp_BL_LONG_2() Comp_JumpTo(RSCRATCH); } -void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +void Compiler::T_Comp_BL_Merged() { - assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); Comp_AddCycles_C(); - u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); - target += (CurInstr.Instr & 0x7FF) << 1; + R15 += 2; - if (Num == 1 || CurInstr.Instr & (1 << 12)) + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) target |= 1; MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d585f39..d8ce1aa 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -338,7 +338,8 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { // Branch F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), 
F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), // Unk, SVC - NULL, NULL + NULL, NULL, + F(T_Comp_BL_Merged) }; #undef F @@ -361,21 +362,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; - R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; CurCPU = cpu; CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); if (!(Num == 0 - ? IsMapped<0>(R15 - (Thumb ? 2 : 4)) - : IsMapped<1>(R15 - (Thumb ? 2 : 4)))) + ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) + : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } - bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -387,8 +385,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { - R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] @@ -406,29 +404,21 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } if (comp != NULL) - RegCache.Prepare(i); + RegCache.Prepare(Thumb, i); else RegCache.Flush(); if (Thumb) { - if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 - && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) - mergedThumbBL = true; - else + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) - { - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); - } - else if (mergedThumbBL) - T_Comp_BL_Merged(instrs[i - 1]); - else - (this->*comp)(); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } + else + (this->*comp)(); } else { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a62f043..fcb2380 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -90,7 +90,7 @@ public: void T_Comp_BranchXchangeReg(); void T_Comp_BL_LONG_1(); void T_Comp_BL_LONG_2(); - void T_Comp_BL_Merged(FetchedInstr prefix); + void T_Comp_BL_Merged(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 5336837..d01c600 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -212,6 +212,9 @@ enum tk_UNK, tk_SVC, + // not a real instruction + tk_BL_LONG, + tk_Count }; -- cgit v1.2.3 From 40b88ab05aeb7e5c5216f29f4004fb5797db04b5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... 
- more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- 18 files changed, 1529 insertions(+), 688 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 7caef75..1e75301 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -623,21 +623,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -753,23 +758,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! 
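A note on the Halted bit 0x20 consumed above: it is the idle loop handshake. When a block whose branch was flagged branch_IdleBranch executes, the generated code ORs 0x20 into ARM::Halted (see Comp_SpecialBranchBehaviour in the compiler changes further down), and the dispatcher clears the bit and fast-forwards to the next scheduled event. A minimal sketch of both sides, assuming exactly this flag layout:

    // emitted block, when it takes an idle branch (what OR(..., Imm8(0x20)) emits):
    //     cpu->Halted |= 0x20;
    // dispatcher, after the block returns:
    bool idleLoop = cpu->Halted & 0x20;       // one-shot flag, consumed every iteration
    cpu->Halted &= ~0x20;
    if (idleLoop && NDS::ARM7Timestamp < NDS::ARM7Target)
        NDS::ARM7Timestamp = NDS::ARM7Target; // the loop body is unobservable until then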
+ if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -779,6 +789,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -801,6 +813,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 811b2e0..b36120a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -311,7 +311,7 @@ public: { *val = BusRead8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -320,7 +320,7 @@ public: *val = BusRead16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -329,7 +329,7 @@ public: *val = BusRead32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -337,14 +337,14 @@ public: addr &= ~3; *val = BusRead32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -353,7 +353,7 @@ public: BusWrite16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -362,7 +362,7 @@ public: BusWrite32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -370,7 +370,7 @@ public: addr &= ~3; BusWrite32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include +#include #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) 
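Note the pattern in the ARM.h hunks above: every DataRegion (addr >> 24) index into NDS::ARM7MemTimings becomes addr >> 15, so bus timings are resolved per 32 KB page rather than per 16 MB slice, which distinguishes sub-regions that share a top byte (for instance the I/O area at 0x04000000 from the Wi-Fi area at 0x04800000). An illustration of the lookup shape; the column meaning is inferred from the existing uses ([0] nonsequential 8/16 bit, [2]/[3] the nonsequential/sequential 32 bit pair):

    u32 addr = 0x04808000;                            // Wi-Fi region, same top byte as I/O
    DataCycles  = NDS::ARM7MemTimings[addr >> 15][0]; // nonsequential 8/16 bit access
    DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; // a following sequential 32 bit access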
+ Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, 
+ DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, 
regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? 
TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? 
"thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. 
-    Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary,
-    we only need every second half word to be adressable.
+
+extern u32 AddrTranslate9[0x2000];
+extern u32 AddrTranslate7[0x4000];
 
-    In case a memory write hits mapped memory, the function block at this
-    address is set to null, so it's recompiled the next time it's executed.
-
-    This method has disadvantages, namely that only writing to the
-    first instruction of a block marks it as invalid and that memory remapping
-    (SWRAM and VRAM) isn't taken into account.
-*/
-
-struct BlockCache
-{
-    CompiledBlock* AddrMapping9[0x2000] = {0};
-    CompiledBlock* AddrMapping7[0x4000] = {0};
-
-    CompiledBlock MainRAM[4*1024*1024/2];
-    CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
-    CompiledBlock ARM9_ITCM[0x8000/2];
-    CompiledBlock ARM9_LCDC[0xA4000/2];
-    CompiledBlock ARM9_BIOS[0x8000/2];
-    CompiledBlock ARM7_BIOS[0x4000/2];
-    CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM
-    CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM
-};
-
-extern BlockCache cache;
+const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
 
+extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 
 template <u32 num>
 inline bool IsMapped(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped];
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped];
 }
 
 template <u32 num>
-inline CompiledBlock LookUpBlock(u32 addr)
+inline u32 TranslateAddr(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF);
 }
 
 template <u32 num>
-inline void Invalidate16(u32 addr)
+inline JitBlockEntry LookUpBlock(u32 addr)
 {
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-            cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL;
-        else
-            cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
-    }
-}
-
-template <u32 num>
-inline void Invalidate32(u32 addr)
-{
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-        {
-            CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
-            page[(addr & 0x7FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x7FFF) >> 1] = NULL;
-        }
-        else
-        {
-            CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
-            page[(addr & 0x3FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x3FFF) >> 1] = NULL;
-        }
-    }
-}
-
-template <u32 num>
-inline void InsertBlock(u32 addr, CompiledBlock func)
-{
-    if (num == 0)
-        cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func;
-    else
-        cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
+    return FastBlockAccess[TranslateAddr<num>(addr) / 2];
 }
 
 void Init();
 void DeInit();
 
-CompiledBlock CompileBlock(ARM* cpu);
+void InvalidateByAddr(u32 pseudoPhysical);
+void InvalidateAll();
+
+void InvalidateITCM(u32 addr);
+void InvalidateByAddr7(u32 addr);
+
+void CompileBlock(ARM* cpu);
 
-void InvalidateBlockCache();
+void ResetBlockCache();
 
 }
diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h
new file mode 100644
index 0000000..4acb488
--- /dev/null
+++ b/src/ARMJIT_Internal.h
@@ -0,0 +1,198 @@
+#ifndef ARMJIT_INTERNAL_H
+#define ARMJIT_INTERNAL_H
+
+#include "types.h"
+#include <stdint.h>
+
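Before the new header continues, a worked example of the pseudo physical mapping defined above (region offsets taken from the ExeMemRegionOffsets table in ARMJIT.cpp: main RAM sits at 0x10000 in the flat space, is 4 MB, and mirrors across the whole 0x02xxxxxx area on the ARM9):

    // both the canonical address and a mirror collapse onto one entry:
    TranslateAddr<0>(0x02000123); // 0x10000 + 0x123 = 0x10123
    TranslateAddr<0>(0x02C00123); // also 0x10123: same code, same block
    // so FastBlockAccess[0x10123 / 2] serves every mirror of that code, and
    // IsMapped<0>(addr) is just "does the page translate past the dummy region?"

This is what the comment in ARMJIT.cpp means by eliminating mirroring with a single translation table.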
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
+
+extern AddressRange CodeRanges[ExeMemSpaceSize / 256];
+
+typedef void (*InterpreterFunc)(ARM* cpu);
+extern InterpreterFunc InterpretARM[];
+extern InterpreterFunc InterpretTHUMB[];
+
+void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size);
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h
index fe2f203..ed6a2b7 100644
--- a/src/ARMJIT_RegisterCache.h
+++ b/src/ARMJIT_RegisterCache.h
@@ -60,15 +60,46 @@ public:
         assert("Welp!");
     }
 
+    void PutLiteral(int reg, u32 val)
+    {
+        LiteralsLoaded |= (1 << reg);
+        LiteralValues[reg] = val;
+    }
+
+    void UnloadLiteral(int reg)
+    {
+        LiteralsLoaded &= ~(1 << reg);
+    }
+
+    bool IsLiteral(int reg)
+    {
+        return LiteralsLoaded & (1 << reg);
+    }
+
+    void PrepareExit()
+    {
+        BitSet16 dirtyRegs(DirtyRegs);
+        for (int reg : dirtyRegs)
+            Compiler->SaveReg(reg, Mapping[reg]);
+    }
+
     void Flush()
     {
         BitSet16 loadedSet(LoadedRegs);
         for (int reg : loadedSet)
             UnloadRegister(reg);
+        LiteralsLoaded = 0;
     }
 
     void Prepare(bool thumb, int i)
     {
+        if (LoadedRegs & (1 << 15))
+            UnloadRegister(15);
+
+        BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs);
+        for (int reg : invalidedLiterals)
+            UnloadLiteral(reg);
+
         u16 futureNeeded = 0;
         int ranking[16];
         for (int j = 0; j < 16; j++)
@@ -86,7 +117,7 @@
         for (int reg : neverNeededAgain)
             UnloadRegister(reg);
 
-        FetchedInstr Instr = Instrs[i];
+        FetchedInstr Instr = Instrs[i];
         u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs;
         BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs);
         if (needToBeLoaded != BitSet16(0))
@@ -125,6 +156,9 @@
     static const int NativeRegsAvailable;
 
     Reg Mapping[16];
+    u32 LiteralValues[16];
+
+    u16 LiteralsLoaded = 0;
     u32 NativeRegsUsed = 0;
     u16 LoadedRegs = 0;
     u16 DirtyRegs = 0;
diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp
index f868ddf..14c223b 100644
--- a/src/ARMJIT_x64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp
@@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp()
         MOV(32, rd, op2);
 
     if (((CurInstr.Instr >> 21) & 0xF) == 0xF)
+    {
         NOT(32, rd);
+        if (op2.IsImm() && CurInstr.Cond() == 0xE)
+            RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32());
+    }
+    else if (op2.IsImm() && CurInstr.Cond() == 0xE)
+        RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32());
 
     if (S)
     {
@@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_()
 
     Comp_AddCycles_C();
 
-    if (op & 1)
+    // special case for thumb mov being alias to add rd, rn, #0
+    if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0)
+    {
+        if (rd != rs)
+            MOV(32, rd, rs);
+    }
+    else if (op & 1)
         Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV);
     else
         Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV);
@@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU()
     u32 op = (CurInstr.Instr >> 6) & 0xF;
 
     if ((op >= 0x2 && op < 0x4) || op == 0x7)
-        Comp_AddCycles_CI(1);
+        Comp_AddCycles_CI(1); // shift by reg
     else
         Comp_AddCycles_C();
 
diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index cc7a3c4..0dedb3f 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -16,9 +16,6 @@ int squeezePointer(T* ptr)
 void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
 {
     // we can simplify constant branches by a lot
-    // it's not completely safe to assume stuff like, which instructions to preload
-    // we'll see how it works out
-
     IrregularCycles = true;
 
     u32 newPC;
@@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, 
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index 5b5f935..8a9b31d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -562,9 +562,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -814,7 +816,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -838,7 +840,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -862,8 +864,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -887,8 +888,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 33bab75..c117a41 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,6 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -56,6 +57,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 9296335..c9013aa 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,6 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0bde139..0cfbd1a 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -575,7 +575,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -807,7 +807,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -2016,10 +2016,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2070,10 +2066,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2140,10 +2132,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2439,7 +2427,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2502,7 +2490,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2575,7 +2563,7 @@ void 
ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) -- cgit v1.2.3 From 52dd0ee75a3dd78bc0ef8638c8ff16c1e9abdd36 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:14:33 +0200 Subject: remove leftover debug code --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 25c55a3..a994d34 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -598,8 +598,6 @@ void Compiler::Comp_AddCycles_CDI() cycles = numC + numD + 1; } - printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); - if (!Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else -- cgit v1.2.3 From 9cf7780e4641abaf07b6c453dfa182a80516c190 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 16 Oct 2019 23:39:12 +0200 Subject: decrease jit block cache address granularity fixes Dragon Quest IX move code with side effects out of assert, fixes release build (thanks to m4wx for this one) also remove some leftovers of jit pipelining --- src/ARMJIT.cpp | 42 ++++++++++++++++++++++--------------- src/ARMJIT_Internal.h | 3 +-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 31 ++++++++++++++------------- src/ARM_InstrInfo.cpp | 25 ++++++++++++++-------- src/ARM_InstrInfo.h | 3 ++- 5 files changed, 60 insertions(+), 44 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 686bdd6..19a5e70 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -106,7 +106,7 @@ u32 AddrTranslate9[0x2000]; u32 AddrTranslate7[0x4000]; JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; -AddressRange CodeRanges[ExeMemSpaceSize / 256]; +AddressRange CodeRanges[ExeMemSpaceSize / 512]; TinyVector JitBlocks; JitBlock* RestoreCandidates[0x1000] = {NULL}; @@ -285,6 +285,13 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = #undef F_MEM_HD #undef F +void T_BL_LONG(ARM* cpu) +{ + ARMInterpreter::T_BL_LONG_1(cpu); + cpu->R[15] += 2; + ARMInterpreter::T_BL_LONG_2(cpu); +} + #define F(x) ARMInterpreter::T_##x InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = { @@ -302,7 +309,7 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = F(PUSH), F(POP), F(LDMIA), F(STMIA), F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), F(UNK), F(SVC), - NULL // BL_LONG psudo opcode + T_BL_LONG // BL_LONG pseudo opcode }; #undef F @@ -341,7 +348,7 @@ void CompileBlock(ARM* cpu) JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), - CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -352,7 +359,7 @@ void CompileBlock(ARM* cpu) instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; - instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + nextInstr[0] = nextInstr[1]; instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; @@ -361,7 +368,7 @@ void CompileBlock(ARM* cpu) u32 translatedAddr = (cpu->Num == 0 ?
TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) { bool returning = false; @@ -400,7 +407,6 @@ void CompileBlock(ARM* cpu) nextInstr[1] = cpuv4->CodeRead32(r15); instrs[i].CodeCycles = cpu->CodeCycles; } - instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); cpu->R[15] = r15; @@ -584,7 +590,7 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; @@ -595,7 +601,7 @@ void CompileBlock(ARM* cpu) void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); - AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + AddressRange* range = &CodeRanges[pseudoPhysical / 512]; int startLength = range->Blocks.Length; for (int i = 0; i < range->Blocks.Length; i++) { @@ -604,15 +610,17 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 256) != (pseudoPhysical / 256)) + if ((addr / 512) != (pseudoPhysical / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 256]; + AddressRange* otherRange = &CodeRanges[addr / 512]; assert(otherRange != range); - assert(otherRange->Blocks.RemoveByValue(block)); + bool removed = otherRange->Blocks.RemoveByValue(block); + assert(removed); } } - assert(JitBlocks.RemoveByValue(block)); + bool removed = JitBlocks.RemoveByValue(block); + assert(removed); FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; @@ -631,14 +639,14 @@ void InvalidateByAddr(u32 pseudoPhysical) void InvalidateByAddr7(u32 addr) { u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) InvalidateByAddr(pseudoPhysical); } void InvalidateITCM(u32 addr) { u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) InvalidateByAddr(pseudoPhysical); } @@ -654,7 +662,7 @@ void InvalidateAll() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeRanges[addr / 256]; + AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; @@ -689,8 +697,8 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 256].Blocks.Clear(); - CodeRanges[addr / 256].TimesInvalidated = 0; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].TimesInvalidated = 0; } delete block; } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4acb488..9e6713d 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -38,7 +38,6 @@ struct FetchedInstr u8 BranchFlags; u8 SetFlags; u32 Instr; - u32 NextInstr[2]; u32 Addr; u8 CodeCycles; @@ -185,7 +184,7 @@ struct __attribute__((packed)) AddressRange u16 TimesInvalidated; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; +extern AddressRange 
CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 13ca415..eb01c87 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -105,7 +105,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) static_assert(sizeof(AddressRange) == 16); LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(8)); + SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); @@ -203,7 +203,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(8)); + SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); @@ -284,28 +284,29 @@ void fault(u32 a, u32 b) void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - if (flags & memop_Store) - { - Comp_AddCycles_CD(); - } - else - { - Comp_AddCycles_CDI(); - } - u32 addressMask = ~0; if (size == 32) addressMask = ~3; if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { - Comp_MemLoadLiteral(size, rd, - R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); + Comp_MemLoadLiteral(size, rd, addr); + return; } - else + { + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } + OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 0fbde26..1261bbe 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 21) +#define ak(x) ((x) << 22) enum { A_Read0 = 1 << 0, @@ -36,7 +36,8 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMem = 1 << 20 + A_WriteMem = 1 << 20, + A_LoadMem = 1 << 21 }; #define A_BIOP A_Read16 @@ -122,7 +123,7 @@ const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); -#define A_LDR A_Write12 +#define A_LDR A_Write12 | A_LoadMem #define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ @@ -143,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(STRB,STR) A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) -#define A_LDRD A_Write12Double +#define A_LDRD A_Write12Double | A_LoadMem #define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ @@ -159,10 +160,10 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_LDM = A_Read16 | A_MemWriteback | A_LoadMem | ak(ak_LDM); const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); @@ -360,6 +361,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; + + if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + res.SpecialKind = special_LoadLiteral; res.EndBlock |= res.Branches(); @@ -377,7 +381,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 21) & 0x1FF; + res.Kind = (data >> 22) & 0x1FF; if (res.Kind == ak_MCR) { @@ -454,12 +458,15 @@ Info Decode(bool thumb, u32 num, u32 instr) res.ReadFlags |= flag_C; if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) res.ReadFlags |= flag_C; - if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) + if ((data & A_SetC) || ((data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F))) res.WriteFlags |= flag_C; if (data & A_WriteMem) res.SpecialKind = special_WriteMem; + if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d02f168..c032a4f 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -230,7 +230,8 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, - special_WaitForInterrupt + special_WaitForInterrupt, + special_LoadLiteral }; struct Info -- cgit v1.2.3 From 441869a10567c2da3de210052cbe93d783a9ce83 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 13:29:17 +0200 Subject: integrate changes from ARM64 backend and 
more - better handle LDM/STM in reg alloc - unify Halted and IRQ in anticipation for branch inlining - literal optimisations can be disabled in gui - jit blocks follow simple returns - fix idle loop detection - break jit blocks on IRQ (fixes saving in Pokemon White) --- src/ARM.cpp | 40 ++++++++++++++++++----------- src/ARM.h | 13 +++++++--- src/ARMJIT.cpp | 50 +++++++++++++++++++++++++++++++------ src/ARMJIT_RegisterCache.h | 33 +++++++++++++++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 7 +++--- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++---- src/ARM_InstrInfo.cpp | 28 +++++++++++++++++++++ src/ARM_InstrInfo.h | 2 +- src/Config.cpp | 2 ++ src/Config.h | 1 + src/NDS.cpp | 4 +-- 11 files changed, 153 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 1e75301..2f4aa90 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -159,7 +159,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -632,16 +632,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -769,16 +774,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index b36120a..96dd857 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e70..0695b85 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) 
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = 
CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe..8f8bd35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4f..2732181 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index c117a41..a7d78cd 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -41,6 +41,7 @@ char DSiNANDPath[1024]; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,6 +59,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c9013aa..1fcd9bb 100644 --- a/src/Config.h +++ b/src/Config.h @@ -55,6 +55,7 @@ extern char DSiNANDPath[1024]; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0cfbd1a..7b6a450 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1211,9 +1211,9 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); if ((ConsoleType == 1) && cpu) - arm->IRQ |= (IE2 & IF2); + arm->IRQ |= !!(IE2 & IF2); } else { -- cgit v1.2.3 From d1d96d2236b705a6c7c0b68d56bc4c8c1e72ec42 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 18:03:31 +0200 Subject: fix config key for jit literal optimisations --- src/Config.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) (limited to 'src')

diff --git a/src/Config.cpp b/src/Config.cpp
index a7d78cd..07b1e3e 100644
--- a/src/Config.cpp
+++ b/src/Config.cpp
@@ -59,7 +59,7 @@ ConfigEntry ConfigFile[] =
     {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0},
     {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0},
     {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0},
-    {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0},
+    {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0},
 #endif

     {"", -1, NULL, 0, NULL, 0}
--
cgit v1.2.3


From 3e7483636f69f18da0efabc10686ed4ab04c6b86 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 3 Nov 2019 15:33:20 +0100
Subject: make literal optimisation more reliable

fixes Spanish Pokemon HeartGold
---
 src/ARMJIT.cpp                      | 52 +++++++++++++++++++++++++++++++++----
 src/ARMJIT.h                        |  2 +-
 src/ARMJIT_Internal.h               |  3 ++-
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 34 +++++++++++++++++++-----
 4 files changed, 77 insertions(+), 14 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 0695b85..c7387c9 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -161,6 +161,27 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
     }
 }

+bool DecodeLiteral(const FetchedInstr& instr, u32& addr)
+{
+    switch (instr.Info.Kind)
+    {
+    case ARMInstrInfo::ak_STR_IMM:
+    case ARMInstrInfo::ak_STRB_IMM:
+        addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1));
+        return true;
+    case ARMInstrInfo::ak_STRD_IMM:
+    case ARMInstrInfo::ak_STRH_IMM:
+        addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1));
+        return true;
+    case ARMInstrInfo::ak_STM: // I honestly hope no one was ever crazy enough to do stm pc, {whatever}
+        addr = instr.Addr + 8;
+        return true;
+    default:
+        JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr);
+        return false;
+    }
+}
+
 bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link,
     u32& linkAddr, u32& targetAddr)
 {
@@ -463,6 +484,23 @@ void CompileBlock(ARM* cpu)
         instrs[i].DataCycles = cpu->DataCycles;
         instrs[i].DataRegion = cpu->DataRegion;

+        if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem
+            && instrs[i].Info.SrcRegs == (1 << 15)
+            && instrs[i].Info.DstRegs == 0)
+        {
+            assert (!thumb);
+
+            u32 addr;
+            if (DecodeLiteral(instrs[i], addr))
+            {
+                JIT_DEBUGPRINT("pc relative write detected\n");
+                u32 translatedAddr = cpu->Num == 0 ?
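+                // (Bookkeeping sketch: code is tracked in 512-byte ranges, and
+                //  each range keeps one InvalidLiterals bit per 16-byte slice.
+                //  A later Comp_MemAccess only folds a PC-relative load whose
+                //  slice is still clean, roughly:
+                //      u32 slice = (translatedAddr & 0x1FF) / 16;
+                //      bool canFold = !(CodeRanges[translatedAddr / 512].InvalidLiterals
+                //                       & (1 << slice));
+                //  so a detected PC-relative write poisons its slice for good.)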
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + ARMJIT::InvalidateByAddr(translatedAddr, false); + CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); + } + } + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -631,7 +669,7 @@ void CompileBlock(ARM* cpu) JitBlocks.Add(block); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; @@ -657,11 +695,14 @@ void InvalidateByAddr(u32 pseudoPhysical) FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; + if (mayRestore) + { + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; - RestoreCandidates[slot] = block; + RestoreCandidates[slot] = block; + } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) range->TimesInvalidated++; @@ -732,6 +773,7 @@ void ResetBlockCache() u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); CodeRanges[addr / 512].TimesInvalidated = 0; + CodeRanges[addr / 512].InvalidLiterals = 0; } delete block; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 1db4d66..09cc463 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -61,7 +61,7 @@ inline JitBlockEntry LookUpBlock(u32 addr) void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); void InvalidateAll(); void InvalidateITCM(u32 addr); diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 9e6713d..fb05f75 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -63,7 +63,7 @@ struct __attribute__((packed)) TinyVector { T* Data = NULL; u16 Capacity = 0; - u32 Length = 0; // make it 32 bit so we don't need movzx + u16 Length = 0; ~TinyVector() { @@ -181,6 +181,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; + u16 InvalidLiterals; u16 TimesInvalidated; }; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 3799774..82f80a7 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -108,7 +108,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); - CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); JMP((u8*)InvalidateByAddr, true); SetJumpTarget(noCode); @@ -206,7 +206,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(ABI_PARAM4), R(RSCRATCH)); SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); - CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); @@ 
-278,10 +278,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr)
     Comp_AddCycles_CDI();
 }

-void fault(u32 a, u32 b)
+/*void fault(u32 a, u32 b, u32 c, u32 d)
 {
-    printf("actually not static! %x %x\n", a, b);
-}
+    printf("actually not static! %x %x %x %x\n", a, b, c, d);
+}*/

 void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags)
 {
@@ -291,11 +291,17 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz
     if (size == 16)
         addressMask = ~1;

+    //bool check = false;
     if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback)))
     {
         u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
-        Comp_MemLoadLiteral(size, rd, addr);
-        return;
+        u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr);
+
+        if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16))))
+        {
+            Comp_MemLoadLiteral(size, rd, addr);
+            return;
+        }
     }

     {
@@ -438,6 +444,20 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz

     CALL(memoryFunc);

+    /*if (Num == 0 && check)
+    {
+        CMP(32, R(EAX), rdMapped);
+        FixupBranch notEqual = J_CC(CC_E);
+        ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0);
+        MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8)));
+        MOV(32, R(ABI_PARAM2), R(EAX));
+        MOV(32, R(ABI_PARAM3), rdMapped);
+        MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr));
+        CALL((u8*)fault);
+        ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0);
+        SetJumpTarget(notEqual);
+    }*/
+
     if (!(flags & memop_Store))
     {
         if (inlinePreparation && size == 32)
--
cgit v1.2.3


From 1cfbbcbb2af09c7f56ca3f6303b0ce8a36cd7146 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 5 Nov 2019 18:50:17 +0100
Subject: make savestates 100% compatible again

---
 src/ARM.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/ARM.cpp b/src/ARM.cpp
index 2f4aa90..896bb5c 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -159,7 +159,11 @@ void ARM::DoSavestate(Savestate* file)
     file->Var32((u32*)&Cycles);
     //file->Var32((u32*)&CyclesToRun);
-    file->Var32(&StopExecution);
+
+    // hack to make save states compatible
+    u32 halted = Halted;
+    file->Var32(&halted);
+    Halted = halted;

     file->VarArray(R, 16*sizeof(u32));
     file->Var32(&CPSR);
--
cgit v1.2.3


From 000c03c9d6307faa7b52988da1510cc4d0dcd8a3 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Fri, 6 Dec 2019 22:16:23 +0100
Subject: disable literal optimisations in DTCM

---
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
index 82f80a7..b66f304 100644
--- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
@@ -347,8 +347,10 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz
             // stupid dtcm...
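            // (Why DTCM is special here: the constant-address fast path below
            //  patches a direct pointer into the emitted x64 code, which can
            //  only reach buffers that fit a 32-bit displacement. Main RAM,
            //  WRAM etc. are static arrays and qualify; DTCM lives inside the
            //  heap-allocated ARMv5 object, which may sit anywhere in the
            //  64-bit address space. A hypothetical reachability check would
            //  look roughly like:
            //      static bool FitsDisp32(const void* ptr)
            //      {
            //          return (uintptr_t)ptr <= 0xFFFFFFFFu;
            //      }
            //  rather than risk that, this commit simply takes the slow path.)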
            if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize))
             {
-                region.Mem = cpu5->DTCM;
-                region.Mask = 0x3FFF;
+                // disable this for now as DTCM is located in heap
+                // which might exceed the RIP-addressable range
+                //region.Mem = cpu5->DTCM;
+                //region.Mask = 0x3FFF;
             }
             else
             {
--
cgit v1.2.3


From ec965c6014df2eb252d9da498684e94fe41fece4 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 17:28:51 +0100
Subject: improve nop handling and proper behaviour for LDM^

fixes dslinux
---
 src/ARM.cpp                         |  2 ++
 src/ARMJIT.cpp                      | 13 +++++++++----
 src/ARMJIT_RegisterCache.h          |  2 +-
 src/ARMJIT_x64/ARMJIT_Branch.cpp    |  6 +++---
 src/ARMJIT_x64/ARMJIT_Compiler.cpp  |  1 +
 src/ARMJIT_x64/ARMJIT_Compiler.h    |  2 ++
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp |  5 +++--
 src/ARM_InstrInfo.cpp               |  2 ++
 src/ARM_InstrInfo.h                 |  2 ++
 9 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/ARM.cpp b/src/ARM.cpp
index 896bb5c..fc0b898 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -768,6 +768,8 @@ void ARMv4::ExecuteJIT()
             return;
         }

+        //printf("executing armv4 at %08x\n", instrAddr);
+
         ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr);
         if (block)
             Cycles += block();
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index c7387c9..8fd7708 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -273,6 +273,8 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount)

 typedef void (*InterpreterFunc)(ARM* cpu);

+void NOP(ARM* cpu) {}
+
 #define F(x) &ARMInterpreter::A_##x
 #define F_ALU(name, s) \
     F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \
@@ -320,7 +322,8 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] =
     F(LDM), F(STM),

     F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG),
-    F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC)
+    F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC),
+    NOP
 };
 #undef F_ALU
 #undef F_MEM_WB
@@ -387,8 +390,8 @@ void CompileBlock(ARM* cpu)
     u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]};
     u32 nextInstrAddr[2] = {blockAddr, r15};

-    JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n",
-        blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2],
+    JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n",
+        blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2],
         cpu->Num == 0 ?
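+        // (Context for this commit's nop handling: cond=0xF encodings that
+        //  aren't the ARM9 BLX_imm now decode to a dedicated ak_Nop kind -
+        //  e.g. PLD, which is only a hint here - and the interpreter table
+        //  gains a matching do-nothing entry:
+        //      void NOP(ARM* cpu) {}
+        //  so such opcodes cost nothing beyond their code cycles.)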
LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); @@ -473,7 +476,9 @@ void CompileBlock(ARM* cpu) else { u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); - assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] + || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 2222bc2..b894657 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -152,7 +152,7 @@ public: needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); - } + } { BitSet16 loadedSet(LoadedRegs); BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 0dedb3f..e02865d 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -134,7 +134,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); bool previouslyDirty = CPSRDirty; SaveCPSR(); @@ -156,12 +156,12 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) if (!restoreCPSR) XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); else - MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste if (Num == 0) CALL((void*)&ARMv5::JumpTo); else CALL((void*)&ARMv4::JumpTo); - + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) { for (int reg : hiRegsLoaded) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd38724..5afe842 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -308,6 +308,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff NULL, NULL, NULL, NULL, NULL, NULL, NULL, + F(Nop) }; const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 792ff66..2cb57dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -79,6 +79,8 @@ public: opInvertOp2 = 1 << 5, }; + void Nop() {} + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b66f304..4cafc1c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -531,7 +531,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -545,7 +545,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); - SaveReg(reg, ABI_PARAM3); + else + SaveReg(reg, 
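+                        // (LDM^ note: with the S bit set and r15 absent,
+                        //  loads target the *user-bank* registers. WriteBanked
+                        //  bails out with carry set when the register isn't
+                        //  banked in the current mode, and only then is the
+                        //  normal register written - into the mapped host
+                        //  register if one is allocated, else into the ARM
+                        //  state, which is what this else-branch fixes. When
+                        //  r15 *is* in the list, LDM^ instead restores CPSR
+                        //  from SPSR, hence the !regs[15] guard above.)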
ABI_PARAM3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 8f8bd35..08e2f0a 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -392,6 +392,8 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; + else if ((instr >> 28) == 0xF) + data = ak(ak_Nop); if (data & A_UnkOnARM7 && num != 0) data = A_UNK; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 2732181..6ab4929 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -139,6 +139,8 @@ enum ak_MRC, ak_SVC, + ak_Nop, + ak_Count, tk_LSL_IMM = 0, -- cgit v1.2.3 From baed0ac0d59d3cbbce01389a84da9774f5613b3b Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 17:38:04 +0100 Subject: remove debug leftovers --- src/ARM.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index fc0b898..896bb5c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -768,8 +768,6 @@ void ARMv4::ExecuteJIT() return; } - //printf("executing armv4 at %08x\n", instrAddr); - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); if (block) Cycles += block(); -- cgit v1.2.3 From 99b34efe2d923c4fd6fbfdb051833d2af6ea2136 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 18:29:52 +0100 Subject: move ARM64 JIT backend here --- CMakeLists.txt | 2 +- src/ARM.h | 9 +- src/ARMJIT.cpp | 4 + src/ARMJIT_A64/ARMJIT_ALU.cpp | 837 +++++++ src/ARMJIT_A64/ARMJIT_Branch.cpp | 452 ++++ src/ARMJIT_A64/ARMJIT_Compiler.cpp | 707 ++++++ src/ARMJIT_A64/ARMJIT_Compiler.h | 234 ++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 848 +++++++ src/ARM_InstrInfo.cpp | 7 +- src/CMakeLists.txt | 27 +- src/dolphin/Align.h | 24 + src/dolphin/Arm64Emitter.cpp | 4466 +++++++++++++++++++++++++++++++++++ src/dolphin/Arm64Emitter.h | 1152 +++++++++ src/dolphin/ArmCommon.h | 27 + src/dolphin/BitUtils.h | 254 ++ src/dolphin/Compat.h | 12 + src/dolphin/MathUtil.cpp | 13 + src/dolphin/MathUtil.h | 121 + 18 files changed, 9188 insertions(+), 8 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_ALU.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Branch.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Compiler.cpp create mode 100644 src/ARMJIT_A64/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_A64/ARMJIT_LoadStore.cpp create mode 100644 src/dolphin/Align.h create mode 100644 src/dolphin/Arm64Emitter.cpp create mode 100644 src/dolphin/Arm64Emitter.h create mode 100644 src/dolphin/ArmCommon.h create mode 100644 src/dolphin/BitUtils.h create mode 100644 src/dolphin/MathUtil.cpp create mode 100644 src/dolphin/MathUtil.h (limited to 'src') diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e53c60..6729e73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ detect_architecture("__i386__" x86) detect_architecture("__arm__" ARM) detect_architecture("__aarch64__" ARM64) -if (ARCHITECTURE STREQUAL x86_64) +if (ARCHITECTURE STREQUAL x86_64 OR ARCHITECTURE STREQUAL ARM64) option(ENABLE_JIT "Enable x64 JIT recompiler" ON) endif() diff --git a/src/ARM.h b/src/ARM.h index 96dd857..7ef1938 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -254,10 +254,14 @@ public: u32 DTCMSetting, ITCMSetting; - u8 ITCM[0x8000]; + // for aarch64 JIT they need to go up here + // to be addressable by a 12-bit immediate u32 ITCMSize; - u8 DTCM[0x4000]; u32 DTCMBase, DTCMSize; + s32 RegionCodeCycles; + + u8 ITCM[0x8000]; + u8 
DTCM[0x4000]; u8 ICache[0x2000]; u32 ICacheTags[64*4]; @@ -282,7 +286,6 @@ public: // code/16N/32N/32S u8 MemTimings[0x100000][4]; - s32 RegionCodeCycles; u8* CurICacheLine; bool (*GetMemRegion)(u32 addr, bool write, NDS::MemRegion* region); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8fd7708..561fabb 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -6,7 +6,11 @@ #include "Config.h" #include "ARMJIT_Internal.h" +#if defined(__x86_64__) #include "ARMJIT_x64/ARMJIT_Compiler.h" +#else +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#endif #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..0fe6a97 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -0,0 +1,837 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_RegShiftReg(int op, bool S, Op2& op2, ARM64Reg rs) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + UBFX(W1, rs, 0, 8); + + if (!S) + { + if (op == 3) + RORV(W0, op2.Reg.Rm, W1); + else + { + CMP(W1, 32); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GE); + ASRV(W0, op2.Reg.Rm, W1); + } + else + { + if (op == 0) + LSLV(W0, op2.Reg.Rm, W1); + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W0, WZR, W0, CC_GE); + } + } + } + else + { + MOV(W0, op2.Reg.Rm); + FixupBranch zero = CBZ(W1); + + SUB(W1, W1, 1); + if (op == 3) + { + RORV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + CMP(W1, 31); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GT); + ASRV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + if (op == 0) + { + LSLV(W0, op2.Reg.Rm, W1); + UBFX(W1, W0, 31, 1); + } + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W1, WZR, op ? W0 : W1, CC_GT); + BFI(RCPSR, W1, 29, 1); + CSEL(W0, WZR, W0, CC_GE); + } + } + + MOV(W0, W0, ArithOption(W0, (ShiftType)op, 1)); + SetJumpTarget(zero); + } + op2 = Op2(W0, ST_LSL, 0); +} + +void Compiler::Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, ARM64Reg tmp) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + switch (op) + { + case 0: // LSL + if (S && amount) + { + UBFX(tmp, op2.Reg.Rm, 32 - amount, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_LSL, amount); + return; + case 1: // LSR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + if (amount == 0) + { + op2 = Op2(0); + return; + } + op2 = Op2(op2.Reg.Rm, ST_LSR, amount); + return; + case 2: // ASR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ASR, amount ? 
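+        // (ARM encodes shift-by-32 as an immediate of 0 for LSR/ASR. A64 has
+        //  no 32-bit ASR #32 encoding, but ASR #31 is equivalent: an
+        //  arithmetic shift right by 31 already replicates the sign bit into
+        //  every position, e.g.
+        //      (s32)0x80000000 >> 31 == 0xFFFFFFFF
+        //      (s32)0x7FFFFFFF >> 31 == 0x00000000
+        //  hence this 'amount ? amount : 31'. LSR #0 above is likewise ARM
+        //  shorthand for LSR #32.)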
amount : 31); + return; + case 3: // ROR + if (amount == 0) + { + UBFX(tmp, RCPSR, 29, 1); + LSL(tmp, tmp, 31); + if (S) + BFI(RCPSR, op2.Reg.Rm, 29, 1); + ORR(tmp, tmp, op2.Reg.Rm, ArithOption(tmp, ST_LSR, 1)); + + op2 = Op2(tmp, ST_LSL, 0); + } + else + { + if (S) + { + UBFX(tmp, op2.Reg.Rm, amount - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ROR, amount); + } + return; + } +} + +void Compiler::Comp_RetriveFlags(bool retriveCV) +{ + if (CurInstr.SetFlags) + CPSRDirty = true; + + if (CurInstr.SetFlags & 0x4) + { + CSET(W0, CC_EQ); + BFI(RCPSR, W0, 30, 1); + } + if (CurInstr.SetFlags & 0x8) + { + CSET(W0, CC_MI); + BFI(RCPSR, W0, 31, 1); + } + if (retriveCV) + { + if (CurInstr.SetFlags & 0x2) + { + CSET(W0, CC_CS); + BFI(RCPSR, W0, 29, 1); + } + if (CurInstr.SetFlags & 0x1) + { + CSET(W0, CC_VS); + BFI(RCPSR, W0, 28, 1); + } + } +} + +void Compiler::Comp_Logical(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (S && !CurInstr.SetFlags) + S = false; + + switch (op) + { + case 0x0: // AND + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, op2.Imm, W0); + else + ANDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, op2.Imm, W0); + else + AND(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x1: // EOR + if (op2.IsImm) + EORI2R(rd, rn, op2.Imm, W0); + else + EOR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xC: // ORR + if (op2.IsImm) + ORRI2R(rd, rn, op2.Imm, W0); + else + ORR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xE: // BIC + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, ~op2.Imm, W0); + else + BICS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, ~op2.Imm, W0); + else + BIC(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + if (S && !CurInstr.SetFlags) + S = false; + + bool CVInGP = false; + switch (op) + { + case 0x2: // SUB + if (S) + { + if (op2.IsImm) + SUBSI2R(rd, rn, op2.Imm, W0); + else + SUBS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + { + MOVI2R(W2, op2.Imm); + SUBI2R(rd, rn, op2.Imm, W0); + } + else + SUB(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x3: // RSB + if (op2.IsZero()) + { + op2 = Op2(WZR); + } + else if (op2.IsImm) + { + MOVI2R(W1, op2.Imm); + op2 = Op2(W1); + } + else if (op2.Reg.ShiftAmount != 0) + { + MOV(W1, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W1); + } + + if (S) + SUBS(rd, op2.Reg.Rm, rn); + else + SUB(rd, op2.Reg.Rm, rn); + break; + case 0x4: // ADD + if (S) + { + if (op2.IsImm) + ADDSI2R(rd, rn, op2.Imm, W0); + else + ADDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ADDI2R(rd, rn, op2.Imm, W0); + else + ADD(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x5: // ADC + UBFX(W2, RCPSR, 29, 1); + if (S) + { + CVInGP = true; + ADDS(W1, rn, W2); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm, W0); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, rn, W2); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm, W0); + else + ADD(rd, W1, op2.Reg.Rm, 
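+        // (Carry plumbing: the guest C flag lives in RCPSR bit 29, extracted
+        //  with UBFX above. SBC/RSC below then lean on the classic identity
+        //      a - b - !C  ==  a + ~b + C
+        //  so W1 is built as ~op2 (or ~rn for RSC) and simply added; in the
+        //  flag-setting variants C and V are snapshotted after each of the
+        //  two additions and merged with CSINC.)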
op2.ToArithOption()); + } + break; + case 0x6: // SBC + UBFX(W2, RCPSR, 29, 1); + // W1 = -op2 - 1 + if (op2.IsImm) + MOVI2R(W1, ~op2.Imm); + else + ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); + if (S) + { + CVInGP = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + ADDS(rd, rn, W1); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + ADD(rd, rn, W1); + } + break; + case 0x7: // RSC + UBFX(W2, RCPSR, 29, 1); + // W1 = -rn - 1 + MVN(W1, rn); + if (S) + { + CVInGP = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm); + else + ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + { + if (CVInGP) + { + BFI(RCPSR, W2, 29, 1); + BFI(RCPSR, W3, 28, 1); + } + Comp_RetriveFlags(!CVInGP); + } +} + +void Compiler::Comp_Compare(int op, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + switch (op) + { + case 0x8: // TST + if (op2.IsImm) + TSTI2R(rn, op2.Imm, W0); + else + ANDS(WZR, rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0x9: // TEQ + if (op2.IsImm) + EORI2R(W0, rn, op2.Imm, W0); + else + EOR(W0, rn, op2.Reg.Rm, op2.ToArithOption()); + TST(W0, W0); + break; + case 0xA: // CMP + if (op2.IsImm) + CMPI2R(rn, op2.Imm, W0); + else + CMP(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0xB: // CMN + if (op2.IsImm) + ADDSI2R(WZR, rn, op2.Imm, W0); + else + CMN(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + } + + Comp_RetriveFlags(op >= 0xA); +} + +// also counts cycles! 
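+// (Reference for the immediate path below - ARM's rotated-immediate operand
+//  decodes as, in sketch form:
+//      u32 imm8 = instr & 0xFF;
+//      u32 rot  = (instr >> 8) & 0xF;      // rotate amount is rot * 2
+//      u32 op2  = ROR(imm8, rot * 2);      // == ROR(imm8, (instr >> 7) & 0x1E)
+//  the register-shift path charges an extra internal cycle when the shift
+//  amount itself comes from a register, and reads PC as PC + 12 there, hence
+//  the ADD of 4 on top of R15 for Rm == 15.)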
+void Compiler::A_Comp_GetOp2(bool S, Op2& op2) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + op2 = Op2(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + op2.Reg.Rm = MapReg(CurInstr.A_Reg(0)); + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + if (CurInstr.A_Reg(0) == 15) + { + ADD(W0, op2.Reg.Rm, 4); + op2.Reg.Rm = W0; + } + Comp_RegShiftReg(op, S, op2, rs); + } + else + { + Comp_AddCycles_C(); + + int amount = (CurInstr.Instr >> 7) & 0x1F; + Comp_RegShiftImm(op, amount, S, op2); + } + } +} + +void Compiler::A_Comp_ALUCmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(op <= 0x9, op2); + + Comp_Compare(op, rn, op2); +} + +void Compiler::A_Comp_ALUMovOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + Op2 op2; + A_Comp_GetOp2(S, op2); + + if (op == 0xF) // MVN + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm); + MOVI2R(rd, ~op2.Imm); + } + else + ORN(rd, WZR, op2.Reg.Rm, op2.ToArithOption()); + } + else // MOV + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm); + MOVI2R(rd, op2.Imm); + } + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + + if (S) + { + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_ALUTriOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + bool logical = (1 << op) & 0xF303; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(S && logical, op2); + + if (op2.IsImm && op2.Imm == 0) + op2 = Op2(WZR, ST_LSL, 0); + + if (logical) + Comp_Logical(op, S, rd, rn, op2); + else + Comp_Arithmetic(op, S, rd, rn, op2); + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_Clz() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + + CLZ(rd, rm); + + assert(Num == 0); +} + +void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg rs, ARM64Reg rn) +{ + if (Num == 0) + { + Comp_AddCycles_CI(S ? 3 : 1); + } + else + { + CLZ(W0, rs); + CLS(W1, rs); + CMP(W0, W1); + CSEL(W0, W0, W1, CC_GT); + Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (mla) + MADD(rd, rm, rs, rn); + else + MUL(rd, rm, rs); + + if (S && FlagsNZNeeded()) + { + TST(rd, rd); + Comp_RetriveFlags(false); + } +} + +void Compiler::A_Comp_Mul_Long() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + + if (Num == 0) + { + Comp_AddCycles_CI(S ? 
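+        // (Multiply timing: the ARM9 multiplier has fixed latency - the long
+        //  multiplies cost 1 internal cycle, or 3 when they set flags. The
+        //  ARM7 multiplier's timing depends on the value of rs, so the
+        //  else-branch below measures it at runtime: CLZ/CLS count the
+        //  redundant leading bits, and
+        //      extra = max(CLZ(rs), CLS(rs)) >> 3
+        //  feeds the dynamic cycle count, one step per padding byte of rs.)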
3 : 1); + } + else + { + CLZ(W0, rs); + CLS(W1, rs); + CMP(W0, W1); + CSEL(W0, W0, W1, CC_GT); + Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (add) + { + MOV(W0, rn); + BFI(X0, EncodeRegTo64(rd), 32, 32); + if (sign) + SMADDL(EncodeRegTo64(rn), rm, rs, X0); + else + UMADDL(EncodeRegTo64(rn), rm, rs, X0); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + else + { + if (sign) + SMULL(EncodeRegTo64(rn), rm, rs); + else + UMULL(EncodeRegTo64(rn), rm, rs); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::A_Comp_Mul() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + + bool S = CurInstr.Instr & (1 << 20); + bool mla = CurInstr.Instr & (1 << 21); + ARM64Reg rn = INVALID_REG; + if (mla) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_Mul_Mla(S, mla, rd, rm, rs, rn); +} + +void Compiler::T_Comp_ShiftImm() +{ + Comp_AddCycles_C(); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + Op2 op2; + op2.Reg.Rm = MapReg(CurInstr.T_Reg(3)); + Comp_RegShiftImm(op, amount, true, op2); + if (op2.IsImm) + MOVI2R(rd, op2.Imm); + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + + Comp_RetriveFlags(false); +} + +void Compiler::T_Comp_AddSub_() +{ + Comp_AddCycles_C(); + + Op2 op2; + if (CurInstr.Instr & (1 << 10)) + op2 = Op2((CurInstr.Instr >> 6) & 0x7); + else + op2 = Op2(MapReg(CurInstr.T_Reg(6))); + + Comp_Arithmetic( + CurInstr.Instr & (1 << 9) ? 0x2 : 0x4, + true, + MapReg(CurInstr.T_Reg(0)), + MapReg(CurInstr.T_Reg(3)), + op2); +} + +void Compiler::T_Comp_ALUImm8() +{ + Comp_AddCycles_C(); + + u32 imm = CurInstr.Instr & 0xFF; + int op = (CurInstr.Instr >> 11) & 0x3; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + + switch (op) + { + case 0: + MOVI2R(rd, imm); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + case 1: + Comp_Compare(0xA, rd, Op2(imm)); + break; + case 2: + case 3: + Comp_Arithmetic(op == 2 ? 0x4 : 0x2, true, rd, rd, Op2(imm)); + break; + } +} + +void Compiler::T_Comp_ALU() +{ + int op = (CurInstr.Instr >> 6) & 0xF; + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.T_Reg(3)); + + if ((op >= 0x2 && op <= 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + Comp_Logical(0x0, true, rd, rd, Op2(rs)); + break; + case 0x1: + Comp_Logical(0x1, true, rd, rd, Op2(rs)); + break; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + Op2 op2; + op2.Reg.Rm = rd; + Comp_RegShiftReg(op == 0x7 ? 
3 : (op - 0x2), true, op2, rs); + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + break; + case 0x5: + Comp_Arithmetic(0x5, true, rd, rd, Op2(rs)); + break; + case 0x6: + Comp_Arithmetic(0x6, true, rd, rd, Op2(rs)); + break; + case 0x8: + Comp_Compare(0x8, rd, Op2(rs)); + break; + case 0x9: + Comp_Arithmetic(0x3, true, rd, rs, Op2(0)); + break; + case 0xA: + Comp_Compare(0xA, rd, Op2(rs)); + break; + case 0xB: + Comp_Compare(0xB, rd, Op2(rs)); + break; + case 0xC: + Comp_Logical(0xC, true, rd, rd, Op2(rs)); + break; + case 0xD: + Comp_Mul_Mla(true, false, rd, rd, rs, INVALID_REG); + break; + case 0xE: + Comp_Logical(0xE, true, rd, rd, Op2(rs)); + break; + case 0xF: + MVN(rd, rs); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0: + Comp_Arithmetic(0x4, false, rdMapped, rdMapped, Op2(rs)); + break; + case 1: + Comp_Compare(0xA, rdMapped, rs); + return; + case 2: + MOV(rdMapped, rs); + break; + } + + if (rd == 15) + { + Comp_JumpTo(rdMapped, false, false); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + ARM64Reg sp = MapReg(13); + u32 offset = (CurInstr.Instr & 0x7F) << 2; + if (CurInstr.Instr & (1 << 7)) + SUB(sp, sp, offset); + else + ADD(sp, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + ARM64Reg sp = MapReg(13); + ADD(rd, sp, offset); + } + else + MOVI2R(rd, (R15 & ~2) + offset); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..542f0b7 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -0,0 +1,452 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +// hack +const int kCodeCacheTiming = 3; + +namespace ARMJIT +{ + +template +void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) +{ + cpu->JumpTo(addr, changeCPSR); +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + ORRI2R(RCPSR, RCPSR, 0x20); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + ANDI2R(RCPSR, RCPSR, ~0x20); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOVI2R(W0, regionCodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true) >> 16; + cycles += cpu9->CodeCycles; + 
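+            // (Constant branch targets let the pipeline refill be replayed at
+            //  compile time: each CodeRead32 here runs against the real bus
+            //  timing state and leaves its cost in CodeCycles, which is summed
+            //  into 'cycles' and later baked into the block as a constant -
+            //  or added at runtime when the branch is conditional.)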
cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOVI2R(W0, codeRegion); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeRegion)); + MOVI2R(W0, codeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + { + MOVI2R(W0, newPC); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + } + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + + +void* Compiler::Gen_JumpTo9(int kind) +{ + AlignCode16(); + void* res = GetRXPtr(); + + MOVI2R(W2, kCodeCacheTiming); + // W1 - code cycles non branch + // W2 - branch code cycles + LSR(W1, W0, 12); + LSL(W1, W1, 2); + ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); + LDRB(W1, RCPU, W1); + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + CMP(W0, W3); + FixupBranch outsideITCM = B(CC_LO); + MOVI2R(W1, 1); + MOVI2R(W2, 1); + SetJumpTarget(outsideITCM); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + ADD(W1, W1, W2); + ADD(RCycles, RCycles, W1); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + FixupBranch halfwordLoc = TBZ(W0, 1); + ADD(W1, W1, W2); + ADD(RCycles, RCycles, W1); + RET(); + + SetJumpTarget(halfwordLoc); + ADD(RCycles, RCycles, W2); + RET(); + } + + return res; +} + +void* Compiler::Gen_JumpTo7(int kind) +{ + void* res = GetRXPtr(); + + LSR(W1, W0, 24); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeRegion)); + LSR(W1, W0, 15); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeCycles)); + + MOVP2R(X2, NDS::ARM7MemTimings); + LDR(W3, X2, ArithOption(W1, true)); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + UBFX(W2, W3, 0, 8); + UBFX(W3, W3, 8, 8); + ADD(W2, W3, W2); + ADD(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, 
RCPU, offsetof(ARM, R[15])); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + UBFX(W2, W3, 16, 8); + UBFX(W3, W3, 24, 8); + ADD(W2, W3, W2); + ADD(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + + return res; +} + +void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR) +{ + IrregularCycles = true; + + if (!restoreCPSR) + { + if (switchThumb) + CPSRDirty = true; + MOV(W0, addr); + BL((Num ? JumpToFuncs7 : JumpToFuncs9)[switchThumb ? 0 : (Thumb + 1)]); + } + else + { + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + bool previouslyDirty = CPSRDirty; + SaveCPSR(); + + if (restoreCPSR) + { + if (Thumb || CurInstr.Cond() >= 0xE) + RegCache.Flush(); + else + { + // the ugly way... + // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + } + + if (switchThumb) + MOV(W1, addr); + else + { + if (Thumb) + ORRI2R(W1, addr, 1); + else + ANDI2R(W1, addr, ~1); + } + MOV(X0, RCPU); + MOVI2R(W2, restoreCPSR); + if (Num == 0) + QuickCallFunction(X3, jumpToTrampoline); + else + QuickCallFunction(X3, jumpToTrampoline); + + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + + if (previouslyDirty) + LoadCPSR(); + CPSRDirty = previouslyDirty; + } +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOVI2R(MapReg(14), R15 - 4); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + ARM64Reg rn = MapReg(CurInstr.A_Reg(0)); + MOV(W0, rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOVI2R(MapReg(14), R15 - 4); + Comp_JumpTo(W0, true); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_BranchSpecialBehaviour(); + + FixupBranch skipFailed = B(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } + + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(W0, MapReg(CurInstr.A_Reg(3))); + MOVI2R(MapReg(14), R15 - 1); + Comp_JumpTo(W0, true); + } + else + { + ARM64Reg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn, true); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOVI2R(MapReg(14), R15 + offset); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + ARM64Reg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + ADD(W0, lr, offset); + MOVI2R(lr, (R15 - 2) | 1); + Comp_JumpTo(W0, Num == 0 && !(CurInstr.Instr & (1 << 12))); +} + +void 
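+// (BL merging: the two halves of a Thumb BL/BLX - tk_BL_LONG_1 parks the
+//  upper offset bits in LR, tk_BL_LONG_2 adds the lower bits and jumps - are
+//  fused into one pseudo-instruction when they are adjacent, making the call
+//  target a compile-time constant. With both halves packed into
+//  CurInstr.Instr (first halfword in the low 16 bits), the target works out
+//  to, in sketch form:
+//      target = (pcFirst + 4) + (SignExtend11(lo & 0x7FF) << 12)
+//                             + ((hi & 0x7FF) << 1);
+//  bit 12 of the second halfword distinguishes BL (set, stays Thumb) from
+//  BLX (clear, switches to ARM; ARM9 only), as the merged handler checks.)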
Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOVI2R(MapReg(14), (R15 - 2) | 1); + + Comp_JumpTo(target); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..89d0029 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -0,0 +1,707 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" + +#include "../ARMJIT_Internal.h" + +#ifdef __SWITCH__ +#include "../switch/compat_switch.h" + +extern char __start__; +#endif + +#include + +using namespace Arm64Gen; + + +namespace ARMJIT +{ + +/* + + Recompiling classic ARM to ARMv8 code is at the same time + easier and trickier than compiling to a less related architecture + like x64. At one hand you can translate a lot of instructions directly. + But at the same time, there are a ton of exceptions, like for + example ADD and SUB can't have a RORed second operand on ARMv8. + */ + +template <> +const ARM64Reg RegisterCache::NativeRegAllocOrder[] = + {W19, W20, W21, W22, W23, W24, W25, W26}; +template <> +const int RegisterCache::NativeRegsAvailable = 8; + +const int JitMemSize = 16 * 1024 * 1024; + +void Compiler::MovePC() +{ + ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); +} + +Compiler::Compiler() +{ +#ifdef __SWITCH__ + JitRWBase = memalign(0x1000, JitMemSize); + + JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000; + JitRWStart = virtmemReserve(JitMemSize); + MemoryInfo info = {0}; + u32 pageInfo = {0}; + int i = 0; + while (JitRXStart != NULL) + { + svcQueryMemory(&info, &pageInfo, (u64)JitRXStart); + if (info.type != MemType_Unmapped) + JitRXStart = (void*)((u8*)info.addr - JitMemSize - 0x1000); + else + break; + if (i++ > 8) + { + printf("couldn't find unmapped place for jit memory\n"); + JitRXStart = NULL; + } + } + + assert(JitRXStart != NULL); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize, Perm_Rx)); + assert(succeded); + succeded = R_SUCCEEDED(svcMapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + + SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); + JitMemUseableSize = JitMemSize; + Reset(); +#endif + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 2; j++) + { + MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); + } + } + MemFunc7[0][0] = (void*)NDS::ARM7Read8; + MemFunc7[1][0] = (void*)NDS::ARM7Read16; + MemFunc7[2][0] = (void*)NDS::ARM7Read32; + MemFunc7[0][1] = (void*)NDS::ARM7Write8; + MemFunc7[1][1] = (void*)NDS::ARM7Write16; + MemFunc7[2][1] = (void*)NDS::ARM7Write32; + + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < 2; j++) + { + MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); + MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); + } + } + + for (int i = 0; i < 3; i++) + { + JumpToFuncs9[i] = Gen_JumpTo9(i); + JumpToFuncs7[i] = Gen_JumpTo7(i); + } + + /* + W0 - mode + W1 - reg num + W3 - in/out value of reg + */ + { + ReadBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + CMP(W0, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + 
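+        // (Banked-register access is kept out of line on purpose: MSR/MRS and
+        //  the user-mode LDM/STM paths all funnel through these two little
+        //  routines instead of inlining the mode switch into every block.
+        //  Convention, as noted above: W0 = mode, W1 = register number,
+        //  W3 = value in/out. FIQ banks r8-r14 and is picked off first; the
+        //  other exception modes only bank r13/r14, so the index is re-based
+        //  and anything outside the banked range returns through
+        //  'notEverything' untouched.)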
FixupBranch notEverything = B(CC_LT); + CMP(W0, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W0, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W0, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W0, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + RET(); + SetJumpTarget(irq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + RET(); + SetJumpTarget(svc); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + RET(); + SetJumpTarget(abt); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + RET(); + SetJumpTarget(und); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + RET(); + } + { + WriteBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + CMP(W0, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W0, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W0, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W0, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W0, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + MOVI2R(W4, 0); + RET(); + + SetJumpTarget(fiq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(irq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(svc); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(abt); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(und); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + MOVI2R(W4, 1); + RET(); + } + + //FlushIcache(); + + JitMemUseableSize -= GetCodeOffset(); + SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); +} + +Compiler::~Compiler() +{ +#ifdef __SWITCH__ + if (JitRWStart != NULL) + { + bool succeded = R_SUCCEEDED(svcUnmapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + virtmemFree(JitRWStart, JitMemSize); + succeded = R_SUCCEEDED(svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + free(JitRWBase); + } +#endif +} + +void Compiler::LoadReg(int reg, ARM64Reg nativeReg) +{ + if (reg == 15) + MOVI2R(nativeReg, R15); + else + LDR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::SaveReg(int reg, ARM64Reg nativeReg) +{ + STR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + LDR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); +} + +void Compiler::SaveCPSR(bool markClean) +{ + if (CPSRDirty) + { + STR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); + CPSRDirty = CPSRDirty && !markClean; + } +} + +FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + LSR(W1, RCPSR, 28); + MOVI2R(W2, 1); + LSLV(W2, W2, W1); + ANDI2R(W2, W2, ARM::ConditionTable[cond], W3); + + return CBZ(W2); + } + else + { + u8 bit = (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))); + + if (cond & 1) + return TBNZ(RCPSR, bit); + else + return TBZ(RCPSR, bit); + } +} + +#define F(x) &Compiler::A_Comp_##x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), 
F(ALUTriOp), + // EOR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SUB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADD + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SBC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ORR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MOV + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // BIC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MVN + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // TST + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // TEQ + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMP + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMN + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // Mul + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL, + // ARMv5 exclusives + F(Clz), NULL, NULL, NULL, NULL, + + // STR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRB + F(MemWB), F(MemWB), 
F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSB + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // Swap + NULL, NULL, + // LDM, STM + F(LDM_STM), F(LDM_STM), + // Branch + F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), + // Special + NULL, NULL, NULL, NULL, NULL, NULL, NULL +}; +#undef F +#define F(x) &Compiler::T_Comp_##x +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = +{ + // Shift imm + F(ShiftImm), F(ShiftImm), F(ShiftImm), + // Add/sub tri operand + F(AddSub_), F(AddSub_), F(AddSub_), F(AddSub_), + // 8 bit imm + F(ALUImm8), F(ALUImm8), F(ALUImm8), F(ALUImm8), + // ALU + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + // ALU hi reg + F(ALU_HiReg), F(ALU_HiReg), F(ALU_HiReg), + // PC/SP relative ops + F(RelAddr), F(RelAddr), F(AddSP), + // LDR PC rel + F(LoadPCRel), + // LDR/STR reg offset + F(MemReg), F(MemReg), F(MemReg), F(MemReg), + // LDR/STR sign extended, half + F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), + // LDR/STR imm offset + F(MemImm), F(MemImm), F(MemImm), F(MemImm), + // LDR/STR half imm offset + F(MemImmHalf), F(MemImmHalf), + // LDR/STR sp rel + F(MemSPRel), F(MemSPRel), + // PUSH/POP + F(PUSH_POP), F(PUSH_POP), + // LDMIA, STMIA + F(LDMIA_STMIA), F(LDMIA_STMIA), + // Branch + F(BCOND), F(BranchXchangeReg), F(BranchXchangeReg), F(B), F(BL_LONG_1), F(BL_LONG_2), + // Unk, SVC + NULL, NULL, + F(BL_Merged) +}; + +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + +void Compiler::Comp_BranchSpecialBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + { + MOVI2R(W0, 1); + STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); + } + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +{ + if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT memory full, resetting...\n"); + ResetBlockCache(); + } + + JitBlockEntry res = (JitBlockEntry)GetRXPtr(); + + Thumb = thumb; + Num = cpu->Num; + CurCPU = cpu; + ConstantCycles = 0; + RegCache = RegisterCache(this, instrs, instrsCount, true); + + //printf("compiling block at %x\n", R15 - (Thumb ? 2 : 4)); + const u32 ALL_CALLEE_SAVED = 0x7FF80000; + + SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); + + //if (Num == 1) + { + ABI_PushRegisters(SavedRegs); + + MOVP2R(RCPU, CurCPU); + MOVI2R(RCycles, 0); + + LoadCPSR(); + } + + for (int i = 0; i < instrsCount; i++) + { + CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; + + CompileFunc comp = Thumb + ? 
T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + Exit = i == (instrsCount - 1) || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + //printf("%x instr %x regs: r%x w%x n%x flags: %x %x %x\n", R15, CurInstr.Instr, CurInstr.Info.SrcRegs, CurInstr.Info.DstRegs, CurInstr.Info.ReadFlags, CurInstr.Info.NotStrictlyNeeded, CurInstr.Info.WriteFlags, CurInstr.SetFlags); + + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOVI2R(W0, R15); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + if (comp == NULL) + { + MOVI2R(W0, CurInstr.Instr); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CurInstr)); + } + if (Num == 0) + { + MOVI2R(W0, (s32)CurInstr.CodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + } + } + + if (comp == NULL) + { + SaveCPSR(); + RegCache.Flush(); + } + else + RegCache.Prepare(Thumb, i); + + if (Thumb) + { + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(X0, RCPU); + QuickCallFunction(X1, ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + Comp_AddCycles_C(); + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretARM[CurInstr.Info.Kind]); + } + else + { + (this->*comp)(); + } + + Comp_BranchSpecialBehaviour(); + + if (cond < 0xE) + { + if (IrregularCycles) + { + FixupBranch skipNop = B(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + SaveCPSR(false); + RegCache.PrepareExit(); + ADD(W0, RCycles, ConstantCycles); + ABI_PopRegisters(SavedRegs); + RET(); + } + + SetJumpTarget(skipNop); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + LoadCPSR(); + } + + RegCache.Flush(); + + //if (Num == 1) + { + SaveCPSR(); + + ADD(W0, RCycles, ConstantCycles); + + ABI_PopRegisters(SavedRegs); + } + //else + // ADD(RCycles, RCycles, ConstantCycles); + + RET(); + + FlushIcache(); + + //printf("finished\n"); + + return res; +} + +void Compiler::Reset() +{ + SetCodePtr(0); + + const u32 brk_0 = 0xD4200000; + + for (int i = 0; i < JitMemUseableSize / 4; i++) + *(((u32*)GetRWPtr()) + i) = brk_0; +} + +void Compiler::Comp_AddCycles_C(bool nonConst) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (!nonConst && !CurInstr.Info.Branches()) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 numI) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; + + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += cycles; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + c; + + ADD(RCycles, RCycles, numI, shift); + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += c; + else + ADD(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) + ADD(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; +} + +} \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h new file mode 100644 index 0000000..7e13507 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -0,0 +1,234 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../ARM.h" +#include "../ARMJIT.h" + +#include "../dolphin/Arm64Emitter.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMJIT_RegisterCache.h" + +namespace ARMJIT +{ + +const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27; +const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28; +const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29; + +struct Op2 +{ + Op2() + {} + + Op2(Arm64Gen::ARM64Reg rm) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = Arm64Gen::ST_LSL; + Reg.ShiftAmount = 0; + } + + Op2(u32 imm) : IsImm(true), Imm(imm) + {} + + Op2(Arm64Gen::ARM64Reg rm, Arm64Gen::ShiftType st, int amount) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = st; + Reg.ShiftAmount = amount; + } + + Arm64Gen::ArithOption ToArithOption() + { + assert(!IsImm); + return Arm64Gen::ArithOption(Reg.Rm, Reg.ShiftType, Reg.ShiftAmount); + } + + bool IsSimpleReg() + { return !IsImm && !Reg.ShiftAmount && Reg.ShiftType == Arm64Gen::ST_LSL; } + bool ImmFits12Bit() + { return IsImm && (Imm & 0xFFF == Imm); } + bool IsZero() + { return IsImm && !Imm; } + + bool IsImm; + union + { + struct + { + Arm64Gen::ARM64Reg Rm; + Arm64Gen::ShiftType ShiftType; + int ShiftAmount; + } Reg; + u32 Imm; + }; +}; + +class Compiler : Arm64Gen::ARM64XEmitter +{ +public: + typedef void (Compiler::*CompileFunc)(); + + Compiler(); + ~Compiler(); + + Arm64Gen::ARM64Reg MapReg(int reg) + { + assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); + return RegCache.Mapping[reg]; + } + + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, 
FetchedInstr instrs[], int instrsCount); + + bool CanCompile(bool thumb, u16 kind); + + bool FlagsNZNeeded() + { + return CurInstr.SetFlags & 0xC; + } + + void Reset(); + + void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_CI(u32 numI); + void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); + void Comp_AddCycles_CD(); + void Comp_AddCycles_CDI(); + + void MovePC(); + + void LoadReg(int reg, Arm64Gen::ARM64Reg nativeReg); + void SaveReg(int reg, Arm64Gen::ARM64Reg nativeReg); + + void LoadCPSR(); + void SaveCPSR(bool markClean = true); + + void A_Comp_ALUTriOp(); + void A_Comp_ALUMovOp(); + void A_Comp_ALUCmpOp(); + + void A_Comp_Mul(); + void A_Comp_Mul_Long(); + + void A_Comp_Clz(); + + void A_Comp_MemWB(); + void A_Comp_MemHD(); + + void A_Comp_LDM_STM(); + + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + + + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALUImm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); + void T_Comp_AddSP(); + void T_Comp_RelAddr(); + + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + + void T_Comp_LDMIA_STMIA(); + void T_Comp_PUSH_POP(); + + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(); + + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + + void Comp_Mul_Mla(bool S, bool mla, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rm, Arm64Gen::ARM64Reg rs, Arm64Gen::ARM64Reg rn); + + void Comp_Compare(int op, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Logical(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Arithmetic(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + + void Comp_RetriveFlags(bool retriveCV); + + Arm64Gen::FixupBranch CheckCondition(u32 cond); + + void Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); + + void A_Comp_GetOp2(bool S, Op2& op2); + + void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); + void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); + + void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); + + void* Gen_MemoryRoutine9(int size, bool store); + + void* Gen_MemoryRoutine9Seq(bool store, bool preinc); + void* Gen_MemoryRoutine7Seq(bool store, bool preinc); + + // 0 = switch mode, 1 = stay arm, 2 = stay thumb + void* Gen_JumpTo9(int kind); + void* Gen_JumpTo7(int kind); + + void Comp_BranchSpecialBehaviour(); + + bool Exit; + + FetchedInstr CurInstr; + bool Thumb; + u32 R15; + u32 Num; + ARM* CurCPU; + u32 ConstantCycles; + u32 CodeRegion; + + BitSet32 SavedRegs; + + u32 JitMemUseableSize; + + void* ReadBanked, *WriteBanked; + + // [size][store] + void* MemFunc9[3][2]; + void* MemFunc7[3][2]; + + // [store][pre increment] + void* MemFuncsSeq9[2][2]; + // "[code in main ram] + void* MemFuncsSeq7[2][2]; + + void* JumpToFuncs9[3]; + void* JumpToFuncs7[3]; + + RegisterCache RegCache; + + bool CPSRDirty = false; + + bool IrregularCycles 
= false; + +#ifdef __SWITCH__ + void* JitRWBase; + void* JitRWStart; + void* JitRXStart; +#endif +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..a5d0e3f --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -0,0 +1,848 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +// W0 - address +// (if store) W1 - value to store +// W2 - code cycles +void* Compiler::Gen_MemoryRoutine9(int size, bool store) +{ + AlignCode16(); + void* res = GetRXPtr(); + + u32 addressMask; + switch (size) + { + case 32: addressMask = ~3; break; + case 16: addressMask = ~1; break; + case 8: addressMask = ~0; break; + } + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); + SUB(W3, W0, W3); + CMP(W3, W4); + FixupBranch insideDTCM = B(CC_LO); + + UBFX(W4, W0, 24, 8); + CMP(W4, 0x02); + FixupBranch outsideMainRAM = B(CC_NEQ); + ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); + MOVP2R(X4, NDS::MainRAM); + if (!store && size == 32) + { + LDR(W3, X3, X4); + ANDI2R(W0, W0, 3); + LSL(W0, W0, 3); + RORV(W0, W3, W0); + } + else if (store) + STRGeneric(size, W1, X3, X4); + else + LDRGeneric(size, false, W0, X3, X4); + RET(); + + SetJumpTarget(outsideMainRAM); + + LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + CMP(W0, W3); + FixupBranch insideITCM = B(CC_LO); + + if (store) + { + if (size > 8) + ANDI2R(W0, W0, addressMask); + + switch (size) + { + case 32: QuickTailCall(X4, NDS::ARM9Write32); break; + case 16: QuickTailCall(X4, NDS::ARM9Write16); break; + case 8: QuickTailCall(X4, NDS::ARM9Write8); break; + } + } + else + { + if (size == 32) + ABI_PushRegisters({0, 30}); + if (size > 8) + ANDI2R(W0, W0, addressMask); + + switch (size) + { + case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; + case 16: QuickTailCall (X4, NDS::ARM9Read16); break; + case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; + } + if (size == 32) + { + ABI_PopRegisters({1, 30}); + ANDI2R(W1, W1, 3); + LSL(W1, W1, 3); + RORV(W0, W0, W1); + RET(); + } + } + + SetJumpTarget(insideDTCM); + ANDI2R(W3, W3, 0x3FFF & addressMask); + ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); + if (!store && size == 32) + { + ANDI2R(W4, W0, 3); + LDR(W0, RCPU, W3); + LSL(W4, W4, 3); + RORV(W0, W0, W4); + } + else if (store) + STRGeneric(size, W1, RCPU, W3); + else + LDRGeneric(size, false, W0, RCPU, W3); + + RET(); + + SetJumpTarget(insideITCM); + ANDI2R(W3, W0, 0x7FFF & addressMask); + if (store) + { + LSR(W0, W3, 8); + ADDI2R(W0, W0, ExeMemRegionOffsets[exeMem_ITCM], W4); + MOVP2R(X4, CodeRanges); + ADD(X4, X4, X0, ArithOption(X0, ST_LSL, 4)); + static_assert(sizeof(AddressRange) == 16); + LDR(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); + FixupBranch null = CBZ(W4); + ABI_PushRegisters({1, 3, 30}); + QuickCallFunction(X4, InvalidateByAddr); + ABI_PopRegisters({1, 3, 30}); + SetJumpTarget(null); + } + ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); + if (!store && size == 32) + { + ANDI2R(W4, W0, 3); + LDR(W0, RCPU, W3); + LSL(W4, W4, 3); + RORV(W0, W0, W4); + } + else if (store) + STRGeneric(size, W1, RCPU, W3); + else + LDRGeneric(size, false, W0, RCPU, W3); + RET(); + + return res; +} + +/* + W0 - base address + X1 - stack space + W2 - values count +*/ +void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +{ + AlignCode16(); + void* res = GetRXPtr(); + + void* loopStart = 
GetRXPtr(); + SUB(W2, W2, 1); + + if (preinc) + ADD(W0, W0, 4); + + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); + LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); + SUB(W4, W0, W4); + CMP(W4, W5); + FixupBranch insideDTCM = B(CC_LO); + + LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); + CMP(W0, W4); + FixupBranch insideITCM = B(CC_LO); + + ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once + if (store) + { + LDR(X1, X1, ArithOption(X2, true)); + QuickCallFunction(X4, NDS::ARM9Write32); + + ABI_PopRegisters({0, 1, 2, 30}); + } + else + { + QuickCallFunction(X4, NDS::ARM9Read32); + MOV(W4, W0); + + ABI_PopRegisters({0, 1, 2, 30}); + + STR(X4, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + SetJumpTarget(insideDTCM); + + ANDI2R(W4, W4, ~3 & 0x3FFF); + ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); + if (store) + { + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X4); + } + else + { + LDR(W5, RCPU, X4); + STR(X5, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + SetJumpTarget(insideITCM); + + ANDI2R(W4, W0, ~3 & 0x7FFF); + + if (store) + { + LSR(W6, W4, 8); + ADDI2R(W6, W6, ExeMemRegionOffsets[exeMem_ITCM], W5); + MOVP2R(X5, CodeRanges); + ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); + static_assert(sizeof(AddressRange) == 16); + LDR(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); + FixupBranch null = CBZ(W5); + ABI_PushRegisters({0, 1, 2, 4, 30}); + MOV(W0, W6); + QuickCallFunction(X5, InvalidateByAddr); + ABI_PopRegisters({0, 1, 2, 4, 30}); + SetJumpTarget(null); + } + + ADDI2R(W4, W4, offsetof(ARMv5, ITCM), W5); + if (store) + { + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X4); + } + else + { + LDR(W5, RCPU, X4); + STR(X5, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + return res; +} + +void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +{ + AlignCode16(); + void* res = GetRXPtr(); + + void* loopStart = GetRXPtr(); + SUB(W2, W2, 1); + + if (preinc) + ADD(W0, W0, 4); + + ABI_PushRegisters({0, 1, 2, 30}); + if (store) + { + LDR(X1, X1, ArithOption(X2, true)); + QuickCallFunction(X4, NDS::ARM7Write32); + ABI_PopRegisters({0, 1, 2, 30}); + } + else + { + QuickCallFunction(X4, NDS::ARM7Read32); + MOV(W4, W0); + ABI_PopRegisters({0, 1, 2, 30}); + STR(X4, X1, ArithOption(X2, true)); + } + + if (!preinc) + ADD(W0, W0, 4); + CBNZ(W2, loopStart); + RET(); + + return res; +} + +void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOVI2R(MapReg(rd), val); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); +} + +void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (flags & memop_Store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & 
(memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) + { + Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); + return; + } + } + + { + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memFunc = Num == 0 + ? MemFunc9[size >> 4][!!(flags & memop_Store)] + : MemFunc7[size >> 4][!!((flags & memop_Store))]; + + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + MOVP2R(X0, ptr); + if (flags & memop_Store) + STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); + else + { + LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); + if (size == 32 && addr & ~0x3) + ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); + } + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + ARM64Reg finalAddr = W0; + if (flags & memop_Post) + { + finalAddr = rnMapped; + MOV(W0, rnMapped); + } + + if (flags & memop_Store) + MOV(W1, rdMapped); + + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might become an immediate + if (offset.IsImm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) + { + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } + + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } + + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); + + if (inlinePreparation) + { + if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) + ANDI2R(rdMapped, W0, 3); + if (size > 8) + ANDI2R(W0, W0, addressMask); + } + QuickCallFunction(X2, memFunc); + if (!(flags & memop_Store)) + { + if (inlinePreparation && !(flags & memop_Store) && size == 32) + { + if (constLocalROR32 == 4) + { + LSL(rdMapped, rdMapped, 3); + RORV(rdMapped, W0, rdMapped); + } + else if (constLocalROR32 > 0) + ROR_(rdMapped, W0, constLocalROR32 << 3); + else + MOV(rdMapped, W0); + } + else if (flags & memop_SignExtend) + { + if (size == 16) + SXTH(rdMapped, W0); + else if (size == 8) + SXTB(rdMapped, W0); + else + assert("What's wrong with you?"); + } + else + MOV(rdMapped, W0); + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 
0, false); + } + } + } +} + +void Compiler::A_Comp_MemWB() +{ + Op2 offset; + if (CurInstr.Instr & (1 << 25)) + offset = Op2(MapReg(CurInstr.A_Reg(0)), (ShiftType)((CurInstr.Instr >> 5) & 0x3), (CurInstr.Instr >> 7) & 0x1F); + else + offset = Op2(CurInstr.Instr & 0xFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, byte ? 8 : 32, flags); +} + +void Compiler::A_Comp_MemHD() +{ + bool load = CurInstr.Instr & (1 << 20); + bool signExtend; + int op = (CurInstr.Instr >> 5) & 0x3; + int size; + + if (load) + { + signExtend = op >= 2; + size = op == 2 ? 8 : 16; + } + else + { + size = 16; + signExtend = false; + } + + Op2 offset; + if (CurInstr.Instr & (1 << 22)) + offset = Op2((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0)); + else + offset = Op2(MapReg(CurInstr.A_Reg(0))); + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), + Op2(MapReg(CurInstr.T_Reg(6))), byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(MapReg(CurInstr.T_Reg(6))), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + if (Config::JIT_LiteralOptimisations) + { + Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); + Comp_AddCycles_CDI(); + } + else + { + bool negative = addr < R15; + u32 abs = negative ? R15 - addr : addr - R15; + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); + } +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 
0 : memop_Store); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + IrregularCycles = true; + + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + Comp_AddCycles_CD(); + + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W0, RCPSR, 0, 5); + + int i = regsCount - 1; + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + if (RegCache.Mapping[reg] != INVALID_REG) + MOV(W3, MapReg(reg)); + else + LoadReg(reg, W3); + MOVI2R(W1, reg - 8); + BL(ReadBanked); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3; + ARM64Reg second = W4; + + if (RegCache.Mapping[reg] != INVALID_REG) + first = MapReg(reg); + else + LoadReg(reg, W3); + + if (RegCache.Mapping[*nextReg] != INVALID_REG) + second = MapReg(*nextReg); + else + LoadReg(*nextReg, W4); + + STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + + i--; + it++; + } + else if (RegCache.Mapping[reg] != INVALID_REG) + STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + else + { + LoadReg(reg, W3); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + i--; + it++; + } + } + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + preinc ^= true; + } + else + MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); + MOVI2R(W2, regsCount); + + BL(Num ? MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + + if (!store) + { + Comp_AddCycles_CDI(); + + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W0, RCPSR, 0, 5); + + int i = regsCount - 1; + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + MOVI2R(W1, reg - 8); + BL(WriteBanked); + FixupBranch alreadyWritten = CBNZ(W4); + if (RegCache.Mapping[reg] != INVALID_REG) + { + MOV(MapReg(reg), W3); + RegCache.DirtyRegs |= 1 << reg; + } + else + SaveReg(reg, W3); + SetJumpTarget(alreadyWritten); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.Mapping[reg] != INVALID_REG) + { + first = MapReg(reg); + if (reg != 15) + RegCache.DirtyRegs |= 1 << reg; + } + if (RegCache.Mapping[*nextReg] != INVALID_REG) + { + second = MapReg(*nextReg); + if (*nextReg != 15) + RegCache.DirtyRegs |= 1 << *nextReg; + } + + LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + + if (first == W3) + SaveReg(reg, W3); + if (second == W4) + SaveReg(*nextReg, W4); + + it++; + i--; + } + else if (RegCache.Mapping[reg] != INVALID_REG) + { + ARM64Reg mapped = MapReg(reg); + LDR(INDEX_UNSIGNED, mapped, SP, i * 8); + + if (reg != 15) + RegCache.DirtyRegs |= 1 << reg; + } + else + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + SaveReg(reg, W3); + } + + it++; + i--; + } + } + ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + + if (!store && regs[15]) + { + ARM64Reg mapped = MapReg(15); + Comp_JumpTo(mapped, Num == 0, usermode); + } + + return regsCount * 4 * (decrement ? 
-1 : 1); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + { + if (offset > 0) + ADD(rn, rn, offset); + else + SUB(rn, rn, -offset); + } +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + ARM64Reg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + if (offset > 0) + ADD(sp, sp, offset); + else + SUB(sp, sp, -offset); +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + ARM64Reg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + u32 regsCount = regs.Count(); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + { + if (offset > 0) + ADD(rb, rb, offset); + else + SUB(rb, rb, -offset); + } +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 08e2f0a..b884773 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + namespace ARMInstrInfo { @@ -363,7 +365,11 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SpecialKind = special_WriteMem; if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; res.SpecialKind = special_LoadLiteral; + } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { @@ -417,7 +423,6 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 cp = ((instr >> 8) & 0xF); if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) { - printf("happens\n"); data = A_UNK; res.Kind = ak_UNK; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bfc0ad9..fce9e49 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,10 +60,31 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp ) + + if (ARCHITECTURE STREQUAL x86_64) + target_sources(core PRIVATE + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp + ) + endif() + if (ARCHITECTURE STREQUAL ARM64) + target_sources(core PRIVATE + dolphin/Arm64Emitter.cpp + dolphin/MathUtil.cpp + + ARMJIT_A64/ARMJIT_Compiler.cpp + ARMJIT_A64/ARMJIT_ALU.cpp + ARMJIT_A64/ARMJIT_LoadStore.cpp + ARMJIT_A64/ARMJIT_Branch.cpp + ) + endif() endif() if (WIN32) diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h new file mode 100644 index 0000000..40c4576 --- /dev/null +++ b/src/dolphin/Align.h @@ -0,0 +1,24 @@ +// This file is under the public domain. 
+ +#pragma once + +#include <cstddef> +#include <type_traits> + +namespace Common +{ +template <typename T> +constexpr T AlignUp(T value, size_t size) +{ + static_assert(std::is_unsigned<T>(), "T must be an unsigned value."); + return static_cast<T>(value + (size - value % size) % size); +} + +template <typename T> +constexpr T AlignDown(T value, size_t size) +{ + static_assert(std::is_unsigned<T>(), "T must be an unsigned value."); + return static_cast<T>(value - value % size); +} + +} // namespace Common diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp new file mode 100644 index 0000000..dbcf425 --- /dev/null +++ b/src/dolphin/Arm64Emitter.cpp @@ -0,0 +1,4466 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cinttypes> +#include <cstring> +#include <vector> + +#include "Align.h" +#include "Arm64Emitter.h" +#include "Assert.h" +#include "BitUtils.h" +#include "../types.h" +#include "MathUtil.h" + +namespace Arm64Gen +{ +namespace +{ +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +int CountLeadingZeros(uint64_t value, int width) +{ + // TODO(jbramley): Optimize this for ARM64 hosts. + int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) + { + count++; + bit_test >>= 1; + } + return count; +} + +uint64_t LargestPowerOf2Divisor(uint64_t value) +{ + return value & -(int64_t)value; +} + +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32* val, bool* shift) +{ + if (input < 4096) + { + *val = input; + *shift = false; + return true; + } + else if ((input & 0xFFF000) == input) + { + *val = input >> 12; + *shift = true; + return true; + } + return false; +} + +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s, + unsigned int* imm_r) +{ + // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL)); + // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits)); + + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate.
So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) + { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) + { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) + { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } + else + { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) + { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } + else + { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. 
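+ // (A concrete input that lands here, for illustration: value =
+ // 0x0000FFFF00000000 gives a = 1 << 32; adding a leaves the single
+ // bit 1 << 48, so b = 1 << 48 and the subtraction clears the word,
+ // making c = 0. That is one stretch of set bits, repeated "once"
+ // across the 64-bit word.)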
+ clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!MathUtil::IsPow2(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array<u64, 6> multipliers = {{ + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }}; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. + DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast<size_t>(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) + { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } + else + { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms.
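+ // As a worked example (illustrative only): value = 0xFF00FF00FF00FF00
+ // has d = 16 and a stretch of 8 set bits, so s = 8 and
+ // r = (clz_a + 1) & 15 = 8. The assignments below then produce
+ // imms = ((-16 << 1) | (8 - 1)) & 0x3f = 0b100111, which is the
+ // "10ssss" row of the table with UInt(ssss) = 7: eight set bits
+ // repeated every 16 bits, rotated right by 8, with n = 0.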
+ *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + +float FPImm8ToFloat(u8 bits) +{ + const u32 sign = bits >> 7; + const u32 bit6 = (bits >> 6) & 1; + const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + const u32 mantissa = (bits & 0xF) << 19; + const u32 f = (sign << 31) | (exp << 23) | mantissa; + + return Common::BitCast<float>(f); +} + +bool FPImm8FromFloat(float value, u8* imm_out) +{ + const u32 f = Common::BitCast<u32>(value); + const u32 mantissa4 = (f & 0x7FFFFF) >> 19; + const u32 exponent = (f >> 23) & 0xFF; + const u32 sign = f >> 31; + + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + + const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4; + const float new_float = FPImm8ToFloat(imm8); + if (new_float == value) + *imm_out = imm8; + else + return false; + + return true; +} +} // Anonymous namespace + +void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr) +{ + m_code = ptr; +} + +void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr) +{ + SetCodePtrUnsafe(ptr); + m_lastCacheFlushEnd = ptr; +} + +void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase) +{ + m_code = 0; + m_lastCacheFlushEnd = 0; + m_rwbase = rwbase; + m_rxbase = rxbase; +} + +ptrdiff_t ARM64XEmitter::GetCodeOffset() +{ + return m_code; +} + +const u8* ARM64XEmitter::GetRWPtr() +{ + return m_rwbase + m_code; +} + +u8* ARM64XEmitter::GetWriteableRWPtr() +{ + return m_rwbase + m_code; +} + +void* ARM64XEmitter::GetRXPtr() +{ + return m_rxbase + m_code; +} + +void ARM64XEmitter::ReserveCodeSpace(u32 bytes) +{ + for (u32 i = 0; i < bytes / 4; i++) + BRK(0); +} + +ptrdiff_t ARM64XEmitter::AlignCode16() +{ + int c = int((u64)m_code & 15); + if (c) + ReserveCodeSpace(16 - c); + return m_code; +} + +ptrdiff_t ARM64XEmitter::AlignCodePage() +{ + int c = int((u64)m_code & 4095); + if (c) + ReserveCodeSpace(4096 - c); + return m_code; +} + +void ARM64XEmitter::Write32(u32 value) +{ + std::memcpy(m_rwbase + m_code, &value, sizeof(u32)); + m_code += sizeof(u32); +} + +void ARM64XEmitter::FlushIcache() +{ + FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code); + m_lastCacheFlushEnd = m_code; +} + +void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end) +{ + if (start == end) + return; + +#if defined(IOS) + // Header file says this is equivalent to: sys_icache_invalidate(start, end - start); + sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start); +#else + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + u64 addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + addr = (u64)start & ~(u64)(dsize - 1); + for (; addr < (u64)end; addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069.
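+ // (For illustration: a CTR_EL0 value of 0x8444C004, as reported by some
+ // Cortex-A57 parts, has IminLine = DminLine = 4, i.e. 4 << 4 = 64-byte
+ // instruction and data cache lines, so the cleaning loop here steps
+ // through [start, end) in 64-byte increments.)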
+ __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = (u64)start & ~(u64)(isize - 1); + for (; addr < (u64)end; addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt); +} + +void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 
bits, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + (((u32)distance << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) +{ + s64 distance = (s64)ptr - s64(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) +{ + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", + __func__, imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) +{ + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? 
(1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) +{ + Rs = DecodeReg(Rs); + Rt2 = 
DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) | + (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | + (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, + int n) +{ + // Sometimes Rd is fixed to SP, but can still be 32bit 
or 64bit.
+  // Use Rn to determine bitness here.
+  bool b64Bit = Is64Bit(Rn);
+
+  Rd = DecodeReg(Rd);
+  Rn = DecodeReg(Rn);
+
+  Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) |
+          (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
+                                        ARM64Reg Rn, s32 imm)
+{
+  bool b64Bit = Is64Bit(Rt);
+  u32 type_encode = 0;
+
+  switch (type)
+  {
+  case INDEX_SIGNED:
+    type_encode = 0b010;
+    break;
+  case INDEX_POST:
+    type_encode = 0b001;
+    break;
+  case INDEX_PRE:
+    type_encode = 0b011;
+    break;
+  case INDEX_UNSIGNED:
+    ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__);
+    break;
+  }
+
+  if (b64Bit)
+  {
+    op |= 0b10;
+    imm >>= 3;
+  }
+  else
+  {
+    imm >>= 2;
+  }
+
+  Rt = DecodeReg(Rt);
+  Rt2 = DecodeReg(Rt2);
+  Rn = DecodeReg(Rn);
+
+  Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) |
+          (Rt2 << 10) | (Rn << 5) | Rt);
+}
+void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm)
+{
+  Rd = DecodeReg(Rd);
+
+  Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__,
+             imm);
+  Rt = DecodeReg(Rt);
+  Rn = DecodeReg(Rn);
+
+  Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
+}
+
+static constexpr bool IsInRangeImm19(s64 distance)
+{
+  return (distance >= -0x40000 && distance <= 0x3FFFF);
+}
+
+static constexpr bool IsInRangeImm14(s64 distance)
+{
+  return (distance >= -0x2000 && distance <= 0x1FFF);
+}
+
+static constexpr bool IsInRangeImm26(s64 distance)
+{
+  return (distance >= -0x2000000 && distance <= 0x1FFFFFF);
+}
+
+static constexpr u32 MaskImm19(s64 distance)
+{
+  return distance & 0x7FFFF;
+}
+
+static constexpr u32 MaskImm14(s64 distance)
+{
+  return distance & 0x3FFF;
+}
+
+static constexpr u32 MaskImm26(s64 distance)
+{
+  return distance & 0x3FFFFFF;
+}
+
+// FixupBranch branching
+void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch)
+{
+  bool Not = false;
+  u32 inst = 0;
+  s64 distance = (s64)(m_code - branch.ptr);
+  distance >>= 2;
+
+  switch (branch.type)
+  {
+  case 1:  // CBNZ
+    Not = true;
+  case 0:  // CBZ
+  {
+    ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    bool b64Bit = Is64Bit(branch.reg);
+    ARM64Reg reg = DecodeReg(branch.reg);
+    inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg;
+  }
+  break;
+  case 2:  // B (conditional)
+    ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond;
+    break;
+  case 4:  // TBNZ
+    Not = true;
+  case 3:  // TBZ
+  {
+    ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    ARM64Reg reg = DecodeReg(branch.reg);
+    inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) |
+           (MaskImm14(distance) << 5) | reg;
+  }
+  break;
+  case 5:  // B (unconditional)
+    ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64,
+               __func__, branch.type, distance);
+    inst = (0x5 << 26) | MaskImm26(distance);
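+    // (Illustrative note, not part of the original patch: 0x5 << 26 is the
+    // unconditional-B opcode; imm26 counts words, so the reachable range is
+    // +/-128 MiB from the branch site.)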
+ break; + case 6: // BL (unconditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + + std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) +{ + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr, + distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) +{ + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) +{ + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BLR(scratchreg); + } + else + { + BL(func); + } +} + +void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! 
Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BR(scratchreg); + } + else + { + B(func); + } +} + +// Unconditional Branch (register) +void ARM64XEmitter::BR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::BLR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::RET(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::ERET() +{ + EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP); +} +void ARM64XEmitter::DRPS() +{ + EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP); +} + +// Exception generation +void ARM64XEmitter::SVC(u32 imm) +{ + EncodeExceptionInst(0, imm); +} + +void ARM64XEmitter::HVC(u32 imm) +{ + EncodeExceptionInst(1, imm); +} + +void ARM64XEmitter::SMC(u32 imm) +{ + EncodeExceptionInst(2, imm); +} + +void ARM64XEmitter::BRK(u32 imm) +{ + EncodeExceptionInst(3, imm); +} + +void ARM64XEmitter::HLT(u32 imm) +{ + EncodeExceptionInst(4, imm); +} + +void ARM64XEmitter::DCPS1(u32 imm) +{ + EncodeExceptionInst(5, imm); +} + +void ARM64XEmitter::DCPS2(u32 imm) +{ + EncodeExceptionInst(6, imm); +} + +void ARM64XEmitter::DCPS3(u32 imm) +{ + EncodeExceptionInst(7, imm); +} + +// System +void ARM64XEmitter::_MSR(PStateField field, u8 imm) +{ + u32 op1 = 0, op2 = 0; + switch (field) + { + case FIELD_SPSel: + op1 = 0; + op2 = 5; + break; + case FIELD_DAIFSet: + op1 = 3; + op2 = 6; + break; + case FIELD_DAIFClr: + op1 = 3; + op2 = 7; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2) +{ + switch (field) + { + case FIELD_NZCV: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 2; + op2 = 0; + break; + case FIELD_FPCR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 0; + break; + case FIELD_FPSR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 1; + break; + case FIELD_PMCR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 6; + op2 = 0; + break; + case FIELD_PMCCNTR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 7; + op2 = 0; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to"); + break; + } +} + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS , CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + +void ARM64XEmitter::HINT(SystemHint op) +{ + EncodeSystemInst(0, 3, 2, 0, op, WSP); +} +void ARM64XEmitter::CLREX() +{ + EncodeSystemInst(0, 3, 3, 0, 2, WSP); +} +void ARM64XEmitter::DSB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 4, WSP); +} +void ARM64XEmitter::DMB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 5, WSP); +} +void ARM64XEmitter::ISB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 6, WSP); +} 
+
+// Add/Subtract (extended register)
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm)
+{
+  CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm)
+{
+  CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+  EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+// Add/Subtract (with carry)
+void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm);
+}
+
+// Conditional Compare (immediate)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond);
+}
+
+// Conditional Compare (register)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+  EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond);
+}
+
+// Conditional Select
+void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(0, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(1, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(2, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+  EncodeCondSelectInst(3, Rd, Rn, Rm, cond);
+}
+
+// Data-Processing 1 source
+void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EncodeData1SrcInst(0, Rd, Rn);
+}
+void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn)
+{
+  EncodeData1SrcInst(1, Rd, Rn);
+}
+void 
ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(2, Rd, Rn); +} +void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void 
ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) +{ + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) +{ + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? 
SP : WSP);
+}
+
+// Data Processing (Immediate)
+void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+  EncodeMOVWideInst(2, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+  EncodeMOVWideInst(0, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+  EncodeMOVWideInst(3, Rd, imm, pos);
+}
+
+// Bitfield move
+void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+  EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+  EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+  EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms);
+}
+
+void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+  u32 size = Is64Bit(Rn) ? 64 : 32;
+  ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+             "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+             lsb, width);
+  EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+  u32 size = Is64Bit(Rn) ? 64 : 32;
+  ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+             "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+             lsb, width);
+  EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift)
+{
+  bool sf = Is64Bit(Rd);
+  bool N = sf;
+  Rd = DecodeReg(Rd);
+  Rn = DecodeReg(Rn);
+  Rm = DecodeReg(Rm);
+  Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rn << 5) | Rd);
+}
+void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+  SBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+  SBFM(Rd, Rn, 0, 15);
+}
+void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn)
+{
+  ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__);
+  SBFM(Rd, Rn, 0, 31);
+}
+void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+  UBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+  UBFM(Rd, Rn, 0, 15);
+}
+
+// Load Register (Literal)
+void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm)
+{
+  EncodeLoadRegisterInst(0, Rt, imm);
+}
+void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm)
+{
+  EncodeLoadRegisterInst(2, Rt, imm);
+}
+void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm)
+{
+  EncodeLoadRegisterInst(3, Rt, imm);
+}
+
+// Load/Store pair
+void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm);
+}
+
+// Load/Store Exclusive
+void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+  EncodeLoadStoreExcInst(4, SP, SP, Rt,
Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 
1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 
2 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must have a 32bit destination register!", __func__);
+  EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm);
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+  switch (size | signExtend)
+  {
+  case 32: LDR  (Rt, Rn, Rm); break;
+  case 33: LDRSW(Rt, Rn, Rm); break;
+  case 16: LDRH (Rt, Rn, Rm); break;
+  case 17: LDRSH(Rt, Rn, Rm); break;
+  case 8:  LDRB (Rt, Rn, Rm); break;
+  case 9:  LDRSB(Rt, Rn, Rm); break;
+  default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break;
+  }
+}
+void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+  switch (size)
+  {
+  case 32: STR  (Rt, Rn, Rm); break;
+  case 16: STRH (Rt, Rn, Rm); break;
+  case 8:  STRB (Rt, Rn, Rm); break;
+  default: PanicAlert("STRGeneric(reg): invalid size %d", size); break;
+  }
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  switch (size | signExtend)
+  {
+  case 32: LDR  (type, Rt, Rn, imm); break;
+  case 33: LDRSW(type, Rt, Rn, imm); break;
+  case 16: LDRH (type, Rt, Rn, imm); break;
+  case 17: LDRSH(type, Rt, Rn, imm); break;
+  case 8:  LDRB (type, Rt, Rn, imm); break;
+  case 9:  LDRSB(type, Rt, Rn, imm); break;
+  default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break;
+  }
+}
+void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+  switch (size)
+  {
+  case 32: STR  (type, Rt, Rn, imm); break;
+  case 16: STRH (type, Rt, Rn, imm); break;
+  case 8:  STRB (type, Rt, Rn, imm); break;
+  default: PanicAlert("STRGeneric(imm): invalid size %d", size); break;
+  }
+}
+
+// Address of label/page PC-relative
+void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm)
+{
+  EncodeAddressInst(0, Rd, imm);
+}
+void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm)
+{
+  EncodeAddressInst(1, Rd, imm >> 12);
+}
+
+// Wrapper around MOVZ+MOVK (and later MOVN)
+void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
+{
+  unsigned int parts = Is64Bit(Rd) ? 4 : 2;
+  BitSet32 upload_part(0);
+
+  // Always start with a movz! Kills the dependency on the register.
+  bool use_movz = true;
+
+  if (!imm)
+  {
+    // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
+    // clearer in disasm too.
+    MOVZ(Rd, 0, SHIFT_0);
+    return;
+  }
+
+  if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
+      (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
+  {
+    // Max unsigned value (or if signed, -1)
+    // Set to ~ZR
+    ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
+    ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0));
+    return;
+  }
+
+  // TODO: Make some more systemic use of MOVN, but this will take care of most cases.
+  // Small negative integer. Use MOVN
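+  // (Illustrative example, not part of the original patch: MOVI2R(W0, 0xFFFF1234)
+  // takes this path and emits the single instruction MOVN W0, #0xEDCB.)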
+  if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
+  {
+    MOVN(Rd, ~imm, SHIFT_0);
+    return;
+  }
+
+  // XXX: Use MOVN when possible.
+  // XXX: Optimize more
+  // XXX: Support rotating immediates to save instructions
+  if (optimize)
+  {
+    for (unsigned int i = 0; i < parts; ++i)
+    {
+      if ((imm >> (i * 16)) & 0xFFFF)
+        upload_part[i] = 1;
+    }
+  }
+
+  u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF;
+  s64 aligned_offset = (s64)imm - (s64)aligned_pc;
+  // The offset for ADR/ADRP is an s32, so make sure it can be represented in that
+  if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
+  {
+    // Immediate we are loading is within 4GB of our aligned range
+    // Most likely an address that we can load in one or two instructions
+    if (!(std::abs(aligned_offset) & 0xFFF))
+    {
+      // Aligned ADR
+      ADRP(Rd, (s32)aligned_offset);
+      return;
+    }
+    else
+    {
+      // If the address is within 1MB of PC we can load it in a single instruction still
+      s64 offset = (s64)imm - (s64)(m_rxbase + m_code);
+      if (offset >= -0xFFFFF && offset <= 0xFFFFF)
+      {
+        ADR(Rd, (s32)offset);
+        return;
+      }
+      else
+      {
+        ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
+        ADD(Rd, Rd, imm & 0xFFF);
+        return;
+      }
+    }
+  }
+
+  for (unsigned i = 0; i < parts; ++i)
+  {
+    if (use_movz && upload_part[i])
+    {
+      MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+      use_movz = false;
+    }
+    else
+    {
+      if (upload_part[i] || !optimize)
+        MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+    }
+  }
+}
+
+bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)
+{
+  // TODO: Also optimize for performance, not just for code size.
+  ptrdiff_t start_offset = GetCodeOffset();
+
+  MOVI2R(Rd, imm1);
+  int size1 = GetCodeOffset() - start_offset;
+
+  SetCodePtrUnsafe(start_offset);
+
+  MOVI2R(Rd, imm2);
+  int size2 = GetCodeOffset() - start_offset;
+
+  SetCodePtrUnsafe(start_offset);
+
+  bool element = size1 > size2;
+
+  MOVI2R(Rd, element ? imm2 : imm1);
+
+  return element;
+}
+
+void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
+{
+  int num_regs = registers.Count();
+  int stack_size = (num_regs + (num_regs & 1)) * 8;
+  auto it = registers.begin();
+
+  if (!num_regs)
+    return;
+
+  // 8 bytes per register, but 16 byte alignment, so we may have to pad one register.
+  // Only update the SP on the last write to avoid the dependency between those stores.
+
+  // The first push must adjust the SP, else a context switch may invalidate everything below SP.
+  if (num_regs & 1)
+  {
+    STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size);
+  }
+  else
+  {
+    ARM64Reg first_reg = (ARM64Reg)(X0 + *it++);
+    ARM64Reg second_reg = (ARM64Reg)(X0 + *it++);
+    STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size);
+  }
+
+  // Fast store for all other registers, this is always an even number.
+  for (int i = 0; i < (num_regs - 1) / 2; i++)
+  {
+    ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
+    ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
+    STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
+  }
+
+  ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
+}
+
+void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
+{
+  int num_regs = registers.Count();
+  int stack_size = (num_regs + (num_regs & 1)) * 8;
+  auto it = registers.begin();
+
+  if (!num_regs)
+    return;
+
+  // We must adjust the SP in the end, so load the first (two) registers at least.
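+  // (Illustrative note, not part of the original patch: deferring these first
+  // loads lets the final LDR/LDP below use post-indexed addressing, so the
+  // whole stack_size is released in the same instruction as the last load.)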
+  ARM64Reg first = (ARM64Reg)(X0 + *it++);
+  ARM64Reg second;
+  if (!(num_regs & 1))
+    second = (ARM64Reg)(X0 + *it++);
+
+  // 8 bytes per register, but 16 byte alignment, so we may have to pad one register.
+  // Only update the SP on the last load to avoid the dependency between those loads.
+
+  // Fast load for all but the first (two) registers, this is always an even number.
+  for (int i = 0; i < (num_regs - 1) / 2; i++)
+  {
+    ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
+    ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
+    LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
+  }
+
+  // Post loading the first (two) registers.
+  if (num_regs & 1)
+    LDR(INDEX_POST, first, SP, stack_size);
+  else
+    LDP(INDEX_POST, first, second, SP, stack_size);
+
+  ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
+}
+
+// Float Emitter
+void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt,
+                                               ARM64Reg Rn, s32 imm)
+{
+  Rt = DecodeReg(Rt);
+  Rn = DecodeReg(Rn);
+  u32 encoded_size = 0;
+  u32 encoded_imm = 0;
+
+  if (size == 8)
+    encoded_size = 0;
+  else if (size == 16)
+    encoded_size = 1;
+  else if (size == 32)
+    encoded_size = 2;
+  else if (size == 64)
+    encoded_size = 3;
+  else if (size == 128)
+    encoded_size = 0;
+
+  if (type == INDEX_UNSIGNED)
+  {
+    ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)),
+               "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__,
+               imm, m_emit->GetCodePtr());
+    ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!",
+               __func__);
+    if (size == 16)
+      imm >>= 1;
+    else if (size == 32)
+      imm >>= 2;
+    else if (size == 64)
+      imm >>= 3;
+    else if (size == 128)
+      imm >>= 4;
+    encoded_imm = (imm & 0xFFF);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255),
+               "%s immediate offset must be within range of -256 to 255!", __func__);
+    encoded_imm = (imm & 0x1FF) << 2;
+    if (type == INDEX_POST)
+      encoded_imm |= 1;
+    else
+      encoded_imm |= 3;
+  }
+
+  Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) |
+          (size == 128 ?
(1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) | + (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) +{ + DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) + { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 
1 : 0); + int rmode = 0; + switch (round) + { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } + else + { + // Use the encoding (vector, single) that keeps the result in the fp register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) + { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) | + (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = !IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! 
Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) | + Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + u32 type_encode = 0; + u32 opc = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) + { + ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } + else if (size == 64) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + imm >>= 3; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 
26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) +{ + union + { + u8 hex; + struct + { + unsigned defgh : 5; + unsigned abc : 3; + }; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 1; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 1; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 1; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 1; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = 
EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + 
opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) +{ + if (IsScalar(Rd) && IsScalar(Rn)) + { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } + else + { + ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) + { + // GPR to scalar single + opcode |= 1; + } + else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) + { + // Scalar single to GPR - defaults are correct + } + else + { + // TODO + ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset 
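The STR/LDR register-offset forms that follow take an ArithOption built from the index register; with index = true the offset register is scaled by the access size via the extended-register form (see the ArithOption class in Arm64Emitter.h further down). A usage sketch, where fpe is a hypothetical, already-initialized ARM64FloatEmitter:

void LoadVectorIndexed(Arm64Gen::ARM64FloatEmitter& fpe)
{
    using namespace Arm64Gen;
    // Q0 = *(X0 + X1 * 16): passing 'true' requests that the index be
    // scaled by the 128-bit access size (the S bit in the encoding).
    fpe.LDR(128, Q0, X0, ArithOption(X1, true));
}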
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) +{ + int type = isDouble ? 
1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) | + (Rn << 5) | Rd); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) +{ + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void 
ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) +{ + u32 imm5 = 0, imm4 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64), + "%s must have a size of 64 when destination is 64bit!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 
0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, 
ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + 
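// SHRN/SHRN2: logically shift each wider source element right by 'shift'
// and narrow the result to dest_size-bit elements, filling the low
// (upper = false) or high (upper = true) half of Rd; the shift amount is
// packed into immh:immb just as in the SSHLL/USHLL helpers above.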
ASSERT_MSG(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) + { + immh = 1; + } + else if (dest_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (dest_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__); + } + else if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else // 64 + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) + { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones 
variant
+    switch (shift)
+    {
+    case 8:
+      cmode |= 2;
+      break;
+    case 16:
+      cmode |= 4;
+      break;
+    case 24:
+      cmode |= 6;
+      break;
+    default:
+      break;
+    }
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__);
+  }
+  EncodeModImm(Q, op, cmode, 0, Rd, imm);
+}
+
+void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+  bool bundled_loadstore = false;
+
+  for (int i = 0; i < 32; ++i)
+  {
+    if (!registers[i])
+      continue;
+
+    int count = 0;
+    while (++count < 4 && (i + count) < 32 && registers[i + count])
+    {
+    }
+    if (count > 1)
+    {
+      bundled_loadstore = true;
+      break;
+    }
+  }
+
+  if (bundled_loadstore && tmp != INVALID_REG)
+  {
+    int num_regs = registers.Count();
+    m_emit->SUB(SP, SP, num_regs * 16);
+    m_emit->ADD(tmp, SP, 0);
+    std::vector<ARM64Reg> island_regs;
+    for (int i = 0; i < 32; ++i)
+    {
+      if (!registers[i])
+        continue;
+
+      int count = 0;
+
+      // 0 = true
+      // 1 < 4 && registers[i + 1] true!
+      // 2 < 4 && registers[i + 2] true!
+      // 3 < 4 && registers[i + 3] true!
+      // 4 < 4 && registers[i + 4] false!
+      while (++count < 4 && (i + count) < 32 && registers[i + count])
+      {
+      }
+
+      if (count == 1)
+        island_regs.push_back((ARM64Reg)(Q0 + i));
+      else
+        ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
+
+      i += count - 1;
+    }
+
+    // Handle island registers
+    std::vector<ARM64Reg> pair_regs;
+    for (auto& it : island_regs)
+    {
+      pair_regs.push_back(it);
+      if (pair_regs.size() == 2)
+      {
+        STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
+        pair_regs.clear();
+      }
+    }
+    if (pair_regs.size())
+      STR(128, INDEX_POST, pair_regs[0], tmp, 16);
+  }
+  else
+  {
+    std::vector<ARM64Reg> pair_regs;
+    for (auto it : registers)
+    {
+      pair_regs.push_back((ARM64Reg)(Q0 + it));
+      if (pair_regs.size() == 2)
+      {
+        STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
+        pair_regs.clear();
+      }
+    }
+    if (pair_regs.size())
+      STR(128, INDEX_PRE, pair_regs[0], SP, -16);
+  }
+}
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+  bool bundled_loadstore = false;
+  int num_regs = registers.Count();
+
+  for (int i = 0; i < 32; ++i)
+  {
+    if (!registers[i])
+      continue;
+
+    int count = 0;
+    while (++count < 4 && (i + count) < 32 && registers[i + count])
+    {
+    }
+    if (count > 1)
+    {
+      bundled_loadstore = true;
+      break;
+    }
+  }
+
+  if (bundled_loadstore && tmp != INVALID_REG)
+  {
+    // The temporary register is only used to indicate that we can use this code path
+    std::vector<ARM64Reg> island_regs;
+    for (int i = 0; i < 32; ++i)
+    {
+      if (!registers[i])
+        continue;
+
+      int count = 0;
+      while (++count < 4 && (i + count) < 32 && registers[i + count])
+      {
+      }
+
+      if (count == 1)
+        island_regs.push_back((ARM64Reg)(Q0 + i));
+      else
+        LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+      i += count - 1;
+    }
+
+    // Handle island registers
+    std::vector<ARM64Reg> pair_regs;
+    for (auto& it : island_regs)
+    {
+      pair_regs.push_back(it);
+      if (pair_regs.size() == 2)
+      {
+        LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
+        pair_regs.clear();
+      }
+    }
+    if (pair_regs.size())
+      LDR(128, INDEX_POST, pair_regs[0], SP, 16);
+  }
+  else
+  {
+    bool odd = num_regs % 2;
+    std::vector<ARM64Reg> pair_regs;
+    for (int i = 31; i >= 0; --i)
+    {
+      if (!registers[i])
+        continue;
+
+      if (odd)
+      {
+        // First load must be a regular LDR if odd
+        odd = false;
+        LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+      }
+      else
+      {
+        pair_regs.push_back((ARM64Reg)(Q0 + i));
+        if (pair_regs.size() == 2)
+        {
+          LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
+          pair_regs.clear();
+        }
+      }
+    }
+  }
+}
+
+void 
ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) +{ + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, imm, shift); + break; + case 1: + ADDS(Rd, Rn, imm, shift); + break; + case 2: + SUB(Rd, Rn, imm, shift); + break; + case 3: + SUBS(Rd, Rn, imm, shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) +{ + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) + { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) + { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but inplace. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done on setting flags. 
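As a concrete instance of the split just described: a 24-bit immediate such as 0x123456 becomes two 12-bit additions, one plain and one LSL-12 shifted. A self-contained sketch of the arithmetic (illustrative only; the emitter code below is the real path):

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t imm = 0x123456;      // fits the !flags, imm < 0x1000000 path
    const uint64_t lo12 = imm & 0xFFF;  // emitted as ADD Rd, Rn, #0x456
    const uint64_t hi12 = imm >> 12;    // emitted as ADD Rd, Rd, #0x123, LSL #12
    assert(lo12 + (hi12 << 12) == imm); // the two additions recompose the immediate
    return 0;
}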
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) + { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) + { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(DYNA_REC, has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch", + (u32)imm); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) +{ + ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) + { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm instruction, like + // 1.0... 
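FPImm8FromFloat (used just below) succeeds only for values in the A64 FMOV-immediate set: +/-(n/16) * 2^r with n in [16, 31] and r in [-3, 4], i.e. a sign bit, a 3-bit exponent and a 4-bit fraction. A brute-force sketch of that membership test (RepresentableAsFPImm8 is a hypothetical name, assumed equivalent to the real check):

#include <cmath>

// Enumerates the 256 values an 8-bit FMOV immediate can express and tests
// membership; every candidate here is exactly representable in binary32,
// so exact float comparison is safe.
static bool RepresentableAsFPImm8(float value)
{
    const float v = std::fabs(value);
    for (int r = -3; r <= 4; ++r)
        for (int n = 16; n <= 31; ++n)
            if (v == (float)n / 16.0f * std::ldexp(1.0f, r))
                return true;
    return false;
}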
+ } + else if (FPImm8FromFloat(value, &imm8)) + { + FMOV(Rd, imm8); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Common::BitCast(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) +{ + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Arm64Gen diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h new file mode 100644 index 0000000..4cb9ff7 --- /dev/null +++ b/src/dolphin/Arm64Emitter.h @@ -0,0 +1,1152 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#include "ArmCommon.h" +#include "Assert.h" +#include "BitSet.h" +#include "Compat.h" + +namespace Arm64Gen +{ +// X30 serves a dual purpose as a link register +// Encoded as +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg +{ + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) +{ + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) +{ + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool IsQuad(ARM64Reg reg) +{ + return 
(reg & 0xC0) == 0xC0;
+}
+constexpr bool IsVector(ARM64Reg reg)
+{
+  return (reg & 0xC0) != 0;
+}
+constexpr bool IsGPR(ARM64Reg reg)
+{
+  return static_cast<int>(reg) < 0x40;
+}
+
+constexpr ARM64Reg DecodeReg(ARM64Reg reg)
+{
+  return static_cast<ARM64Reg>(reg & 0x1F);
+}
+constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg)
+{
+  return static_cast<ARM64Reg>(reg | 0x20);
+}
+constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg)
+{
+  return static_cast<ARM64Reg>(DecodeReg(reg) + S0);
+}
+constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg)
+{
+  return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80);
+}
+constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg)
+{
+  return static_cast<ARM64Reg>(reg | 0xC0);
+}
+
+enum OpType
+{
+  TYPE_IMM = 0,
+  TYPE_REG,
+  TYPE_IMMSREG,
+  TYPE_RSR,
+  TYPE_MEM
+};
+
+enum ShiftType
+{
+  ST_LSL = 0,
+  ST_LSR = 1,
+  ST_ASR = 2,
+  ST_ROR = 3,
+};
+
+enum IndexType
+{
+  INDEX_UNSIGNED,
+  INDEX_POST,
+  INDEX_PRE,
+  INDEX_SIGNED, // used in LDP/STP
+};
+
+enum ShiftAmount
+{
+  SHIFT_0 = 0,
+  SHIFT_16 = 1,
+  SHIFT_32 = 2,
+  SHIFT_48 = 3,
+};
+
+enum RoundingMode
+{
+  ROUND_A, // round to nearest, ties to away
+  ROUND_M, // round towards -inf
+  ROUND_N, // round to nearest, ties to even
+  ROUND_P, // round towards +inf
+  ROUND_Z, // round towards zero
+};
+
+struct FixupBranch
+{
+  ptrdiff_t ptr;
+  // Type defines
+  // 0 = CBZ (32bit)
+  // 1 = CBNZ (32bit)
+  // 2 = B (conditional)
+  // 3 = TBZ
+  // 4 = TBNZ
+  // 5 = B (unconditional)
+  // 6 = BL (unconditional)
+  u32 type;
+
+  // Used with B.cond
+  CCFlags cond;
+
+  // Used with TBZ/TBNZ
+  u8 bit;
+
+  // Used with Test/Compare and Branch
+  ARM64Reg reg;
+};
+
+enum PStateField
+{
+  FIELD_SPSel = 0,
+  FIELD_DAIFSet,
+  FIELD_DAIFClr,
+  FIELD_NZCV, // The only system registers accessible from EL0 (user space)
+  FIELD_PMCR_EL0,
+  FIELD_PMCCNTR_EL0,
+  FIELD_FPCR = 0x340,
+  FIELD_FPSR = 0x341,
+};
+
+enum SystemHint
+{
+  HINT_NOP = 0,
+  HINT_YIELD,
+  HINT_WFE,
+  HINT_WFI,
+  HINT_SEV,
+  HINT_SEVL,
+};
+
+enum BarrierType
+{
+  OSHLD = 1,
+  OSHST = 2,
+  OSH = 3,
+  NSHLD = 5,
+  NSHST = 6,
+  NSH = 7,
+  ISHLD = 9,
+  ISHST = 10,
+  ISH = 11,
+  LD = 13,
+  ST = 14,
+  SY = 15,
+};
+
+class ArithOption
+{
+public:
+  enum WidthSpecifier
+  {
+    WIDTH_DEFAULT,
+    WIDTH_32BIT,
+    WIDTH_64BIT,
+  };
+
+  enum ExtendSpecifier
+  {
+    EXTEND_UXTB = 0x0,
+    EXTEND_UXTH = 0x1,
+    EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */
+    EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */
+    EXTEND_SXTB = 0x4,
+    EXTEND_SXTH = 0x5,
+    EXTEND_SXTW = 0x6,
+    EXTEND_SXTX = 0x7,
+  };
+
+  enum TypeSpecifier
+  {
+    TYPE_EXTENDEDREG,
+    TYPE_IMM,
+    TYPE_SHIFTEDREG,
+  };
+
+private:
+  ARM64Reg m_destReg;
+  WidthSpecifier m_width;
+  ExtendSpecifier m_extend;
+  TypeSpecifier m_type;
+  ShiftType m_shifttype;
+  u32 m_shift;
+
+public:
+  ArithOption(ARM64Reg Rd, bool index = false)
+  {
+    // Indexed registers are a feature of AArch64
+    // On Loadstore instructions that use a register offset
+    // We can have the register as an index
+    // If we are indexing then the offset register will
+    // be shifted to the left so we are indexing at intervals
+    // of the size of what we are loading
+    // 8-bit: Index does nothing
+    // 16-bit: Index LSL 1
+    // 32-bit: Index LSL 2
+    // 64-bit: Index LSL 3
+    if (index)
+      m_shift = 4;
+    else
+      m_shift = 0;
+
+    m_destReg = Rd;
+    m_type = TYPE_EXTENDEDREG;
+    if (Is64Bit(Rd))
+    {
+      m_width = WIDTH_64BIT;
+      m_extend = EXTEND_UXTX;
+    }
+    else
+    {
+      m_width = WIDTH_32BIT;
+      m_extend = EXTEND_UXTW;
+    }
+    m_shifttype = ST_LSL;
+  }
+  ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
+  {
+    m_destReg = Rd;
+    m_shift = 
shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } + else + { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { return m_type; } + ARM64Reg GetReg() const { return m_destReg; } + u32 GetData() const + { + switch (m_type) + { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter +{ + friend class ARM64FloatEmitter; + +private: + ptrdiff_t m_code; + ptrdiff_t m_lastCacheFlushEnd; + u8* m_rwbase; + u8* m_rxbase; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + // TODO: make this less ugly + // used for Switch where memory is executable and writeable and different addresses + // we need to take this for relative addressing in account + + void Write32(u32 value); + +public: + ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), 
m_rwbase(nullptr), m_rxbase(nullptr) {} + ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset) + { + m_rwbase = rwbase; + m_rxbase = rxbase; + m_code = offset; + m_lastCacheFlushEnd = offset; + } + + virtual ~ARM64XEmitter() {} + void SetCodePtr(ptrdiff_t ptr); + void SetCodePtrUnsafe(ptrdiff_t ptr); + void SetCodeBase(u8* rwbase, u8* rxbase); + void ReserveCodeSpace(u32 bytes); + ptrdiff_t AlignCode16(); + ptrdiff_t AlignCodePage(); + ptrdiff_t GetCodeOffset(); + const u8* GetRWPtr(); + u8* GetWriteableRWPtr(); + void* GetRXPtr(); + void FlushIcache(); + void FlushIcacheSection(u8* start, u8* end); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + 
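A usage sketch of the fix-up API above: forward branches return a FixupBranch whose target is patched once known. Here emit and EmitClampNegativeToZero are hypothetical names; CC_GE comes from ArmCommon.h. The CSET/CSETM aliases that follow then materialize a condition into a register via CSINC/CSINV.

void EmitClampNegativeToZero(Arm64Gen::ARM64XEmitter& emit)
{
    using namespace Arm64Gen;
    emit.CMP(W0, 0);                     // compare the value in W0 against zero
    FixupBranch nonneg = emit.B(CC_GE);  // forward branch; target not yet known
    emit.MOVZ(W0, 0);                    // negative path: clamp to zero
    emit.SetJumpTarget(nonneg);          // resolve the branch to land here
    emit.RET();                          // return with the clamped value in W0
}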
void CSET(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, 
ST_LSL, 0)); } + // Convenience wrappers around ORR. These match the official convenience syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); } + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); } + // Load Register (Literal) + void LDR(ARM64Reg Rt, u32 imm); + void LDRSW(ARM64Reg Rt, u32 imm); + void PRFM(ARM64Reg Rt, u32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void 
STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+ void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+
+ // Load/Store register (immediate indexed)
+ void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Load/Store register (register offset)
+ void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ // Load/Store register (unscaled offset)
+ void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Load/Store pair
+ void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+ void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Address of label/page PC-relative
+ void ADR(ARM64Reg Rd, s32 imm);
+ void ADRP(ARM64Reg Rd, s32 imm);
+
+ // Wrapper around MOVZ+MOVK
+ void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
+ bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
+ template <class P>
+ void MOVP2R(ARM64Reg Rd, P* ptr)
+ {
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
+ MOVI2R(Rd, (uintptr_t)ptr);
+ }
+
+ // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch
+ // register.
+ void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG)
+ {
+ ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch);
+ }
+ void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+ void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
+ ARM64Reg scratch);
+ void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+ bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryCMPI2R(ARM64Reg Rn, u32 imm);
+
+ bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+
+ // ABI related
+ void ABI_PushRegisters(BitSet32 registers);
+ void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+
+ // Utility to generate a call to a std::function object.
+ //
+ // Unfortunately, calling operator() directly is undefined behavior in C++
+ // (this method might be a thunk in the case of multi-inheritance) so we
+ // have to go through a trampoline function.
+ template <typename T, typename... Args>
+ static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
+ {
+ return (*f)(args...);
+ }
+
+ // This function expects you to have set up the state.
+ // Overwrites X0 and X30
+ template <typename T, typename... Args>
+ ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f)
+ {
+ auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>;
+ MOVI2R(X30, (uintptr_t)trampoline);
+ MOVI2R(X0, (uintptr_t) const_cast<void*>((const void*)f));
+ return X30;
+ }
+
+ void QuickTailCall(ARM64Reg scratchreg, const void* func);
+ template <typename T>
+ void QuickTailCall(ARM64Reg scratchreg, T func)
+ {
+ QuickTailCall(scratchreg, (const void*)func);
+ }
+
+ // Plain function call
+ void QuickCallFunction(ARM64Reg scratchreg, const void* func);
+ template <typename T>
+ void QuickCallFunction(ARM64Reg scratchreg, T func)
+ {
+ QuickCallFunction(scratchreg, (const void*)func);
+ }
+};
+
+class ARM64FloatEmitter
+{
+public:
+ ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}
+ void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Loadstore unscaled
+ void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Loadstore single structure
+ void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+ void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+ void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+ void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+ void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+ void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+ void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+ void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Loadstore multiple structure
+ void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+ void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+ void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+ void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+
+ // Loadstore paired
+ void LDP(u8 size, IndexType type, ARM64Reg Rt, 
ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void 
INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that goes to an integer + // register + // and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + inline void Write32(u32 value) { m_emit->Write32(value); } + // Emitting functions + void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 
opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, + ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +} \ No newline at end of file diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h new file mode 100644 index 0000000..6d82e9d --- /dev/null +++ b/src/dolphin/ArmCommon.h @@ -0,0 +1,27 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
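A minimal usage sketch of the emitter interface above, not part of the patch — it assumes the Arm64Gen namespace and the RET()/GetCodePtr() members declared in the portion of Arm64Emitter.h not reproduced in this hunk:

// Emits the AArch64 equivalent of: u32 LoadAndAdd(u32* p) { return *p + 0x123; }
// 'emit' must already point at writable + executable memory (see CodeBlock.h).
const u8* EmitLoadAndAdd(Arm64Gen::ARM64XEmitter& emit)
{
    using namespace Arm64Gen;

    const u8* start = emit.GetCodePtr();
    emit.LDR(INDEX_UNSIGNED, W1, X0, 0); // W1 = *p; X0 holds the first argument
    emit.ADDI2R(W0, W1, 0x123, W2);      // W0 = W1 + 0x123, W2 as scratch
    emit.RET();                          // result is returned in W0
    return start;
}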
+
+#include "../types.h"
+
+enum CCFlags
+{
+ CC_EQ = 0, // Equal
+ CC_NEQ, // Not equal
+ CC_CS, // Carry Set
+ CC_CC, // Carry Clear
+ CC_MI, // Minus (Negative)
+ CC_PL, // Plus
+ CC_VS, // Overflow
+ CC_VC, // No Overflow
+ CC_HI, // Unsigned higher
+ CC_LS, // Unsigned lower or same
+ CC_GE, // Signed greater than or equal
+ CC_LT, // Signed less than
+ CC_GT, // Signed greater than
+ CC_LE, // Signed less than or equal
+ CC_AL, // Always (unconditional) 14
+ CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same
+ CC_LO = CC_CC, // Alias of CC_CC Unsigned lower
+};
+const u32 NO_COND = 0xE0000000;
diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h
new file mode 100644
index 0000000..8b64a92
--- /dev/null
+++ b/src/dolphin/BitUtils.h
@@ -0,0 +1,254 @@
+// Copyright 2017 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <type_traits>
+
+namespace Common
+{
+///
+/// Retrieves the size of a type in bits.
+///
+/// @tparam T Type to get the size of.
+///
+/// @return the size of the type in bits.
+///
+template <typename T>
+constexpr size_t BitSize() noexcept
+{
+ return sizeof(T) * CHAR_BIT;
+}
+
+///
+/// Extracts a bit from a value.
+///
+/// @param src The value to extract a bit from.
+/// @param bit The bit to extract.
+///
+/// @tparam T The type of the value.
+///
+/// @return The extracted bit.
+///
+template <typename T>
+constexpr T ExtractBit(const T src, const size_t bit) noexcept
+{
+ return (src >> bit) & static_cast<T>(1);
+}
+
+///
+/// Extracts a bit from a value.
+///
+/// @param src The value to extract a bit from.
+///
+/// @tparam bit The bit to extract.
+/// @tparam T The type of the value.
+///
+/// @return The extracted bit.
+///
+template <size_t bit, typename T>
+constexpr T ExtractBit(const T src) noexcept
+{
+ static_assert(bit < BitSize<T>(), "Specified bit must be within T's bit width.");
+
+ return ExtractBit(src, bit);
+}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param src The value to extract the bits from.
+/// @param begin The beginning of the bit range. This is inclusive.
+/// @param end The ending of the bit range. This is inclusive.
+///
+/// @tparam T The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+/// of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept
+{
+ return static_cast<Result>(((static_cast<Result>(src) << ((BitSize<T>() - 1) - end)) >>
+ (BitSize<T>() - end + begin - 1)));
+}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param src The value to extract the bits from.
+///
+/// @tparam begin The beginning of the bit range. This is inclusive.
+/// @tparam end The ending of the bit range. This is inclusive.
+/// @tparam T The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+/// of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <size_t begin, size_t end, typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src) noexcept
+{
+ static_assert(begin < end, "Beginning bit must be less than the ending bit.");
+ static_assert(begin < BitSize<T>(), "Beginning bit is larger than T's bit width.");
+ static_assert(end < BitSize<T>(), "Ending bit is larger than T's bit width.");
+
+ return ExtractBits<T, Result>(src, begin, end);
+}
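To make the double shift in ExtractBits concrete: the left shift discards everything above the inclusive end bit, and the right shift drops everything below begin. A small self-contained check (a sketch, not part of the patch; the LDR encoding is just sample data):

#include <cassert>
#include <cstdint>
#include "BitUtils.h"

int main()
{
    const uint32_t instr = 0xE59F1004; // ARM-mode LDR r1, [pc, #4]

    // Bits 28..31 (inclusive) hold the condition field: 0xE = always.
    assert(Common::ExtractBits(instr, 28, 31) == 0xE);

    // Bit 25 is the I bit of the load/store encoding; 0 means immediate offset.
    assert(Common::ExtractBit(instr, 25) == 0);

    return 0;
}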
+
+///
+/// Rotates a value left (ROL).
+///
+/// @param value The value to rotate.
+/// @param amount The number of bits to rotate the value.
+/// @tparam T An unsigned type.
+///
+/// @return The rotated value.
+///
+template <typename T>
+constexpr T RotateLeft(const T value, size_t amount) noexcept
+{
+ static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types left.");
+
+ amount %= BitSize<T>();
+
+ if (amount == 0)
+ return value;
+
+ return static_cast<T>((value << amount) | (value >> (BitSize<T>() - amount)));
+}
+
+///
+/// Rotates a value right (ROR).
+///
+/// @param value The value to rotate.
+/// @param amount The number of bits to rotate the value.
+/// @tparam T An unsigned type.
+///
+/// @return The rotated value.
+///
+template <typename T>
+constexpr T RotateRight(const T value, size_t amount) noexcept
+{
+ static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types right.");
+
+ amount %= BitSize<T>();
+
+ if (amount == 0)
+ return value;
+
+ return static_cast<T>((value >> amount) | (value << (BitSize<T>() - amount)));
+}
+
+///
+/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11.
+/// Both edge cases of all zeros and all ones are considered valid masks, too.
+///
+/// @param mask The mask value to test for validity.
+///
+/// @tparam T The type of the value.
+///
+/// @return A bool indicating whether the mask is valid.
+///
+template <typename T>
+constexpr bool IsValidLowMask(const T mask) noexcept
+{
+ static_assert(std::is_integral<T>::value, "Mask must be an integral type.");
+ static_assert(std::is_unsigned<T>::value, "Signed masks can introduce hard to find bugs.");
+
+ // Can be efficiently determined without looping or bit counting. It's the counterpart
+ // to https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
+ // and doesn't require special casing either edge case.
+ return (mask & (mask + 1)) == 0;
+}
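RotateRight is exactly the operation behind ARM-mode "rotated immediate" operands (an 8-bit value rotated right by twice the 4-bit rotate field), which the JIT decodes constantly. A quick sanity check (a sketch, not part of the patch):

#include <cassert>
#include "BitUtils.h"

int main()
{
    // 0xFF rotated right by 8 lands in the top byte.
    assert(Common::RotateRight(0xFFu, 8) == 0xFF000000u);

    // The rotate amount is reduced modulo the bit width, so 32 is a no-op.
    assert(Common::RotateRight(0xFFu, 32) == 0xFFu);

    // Masks of the form 0b00...0011...11 are accepted, others rejected.
    assert(Common::IsValidLowMask(0x0000FFFFu));
    assert(!Common::IsValidLowMask(0x00FF00FFu));

    return 0;
}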
+
+///
+/// Reinterpret objects of one type as another by bit-casting between object representations.
+///
+/// @remark This is the example implementation of std::bit_cast which is to be included
+/// in C++2a. See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html
+/// for more details. The only difference is this variant is not constexpr,
+/// as the mechanism for bit_cast requires a compiler built-in to have that quality.
+///
+/// @param source The source object to convert to another representation.
+///
+/// @tparam To The type to reinterpret source as.
+/// @tparam From The initial type representation of source.
+///
+/// @return The representation of type From as type To.
+///
+/// @pre Both To and From types must be the same size
+/// @pre Both To and From types must satisfy the TriviallyCopyable concept.
+///
+template <typename To, typename From>
+inline To BitCast(const From& source) noexcept
+{
+ static_assert(sizeof(From) == sizeof(To),
+ "BitCast source and destination types must be equal in size.");
+ static_assert(std::is_trivially_copyable<From>(),
+ "BitCast source type must be trivially copyable.");
+ static_assert(std::is_trivially_copyable<To>(),
+ "BitCast destination type must be trivially copyable.");
+
+ std::aligned_storage_t<sizeof(To), alignof(To)> storage;
+ std::memcpy(&storage, &source, sizeof(storage));
+ return reinterpret_cast<To&>(storage);
+}
+
+template <typename T, typename PtrType>
+class BitCastPtrType
+{
+public:
+ static_assert(std::is_trivially_copyable<PtrType>(),
+ "BitCastPtr source type must be trivially copyable.");
+ static_assert(std::is_trivially_copyable<T>(),
+ "BitCastPtr destination type must be trivially copyable.");
+
+ explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {}
+
+ // Enable operator= only for pointers to non-const data
+ template <typename S>
+ inline typename std::enable_if<std::is_same<T, S>() && !std::is_const<PtrType>()>::type
+ operator=(const S& source)
+ {
+ std::memcpy(m_ptr, &source, sizeof(source));
+ }
+
+ inline operator T() const
+ {
+ T result;
+ std::memcpy(&result, m_ptr, sizeof(result));
+ return result;
+ }
+
+private:
+ PtrType* m_ptr;
+};
+
+// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs
+// Conversion constructor and operator= provided for a convenient syntax.
+// Usage: MyStruct s = BitCastPtr<MyStruct>(some_ptr);
+// BitCastPtr<MyStruct>(some_ptr) = s;
+template <typename T, typename PtrType>
+inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType<T, PtrType>
+{
+ return BitCastPtrType<T, PtrType>{ptr};
+}
+
+template <typename T>
+void SetBit(T& value, size_t bit_number, bool bit_value)
+{
+ static_assert(std::is_unsigned<T>(), "SetBit is only sane on unsigned types.");
+
+ if (bit_value)
+ value |= (T{1} << bit_number);
+ else
+ value &= ~(T{1} << bit_number);
+}
+
+} // namespace Common
diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h
index f2f52a5..787d505 100644
--- a/src/dolphin/Compat.h
+++ b/src/dolphin/Compat.h
@@ -61,3 +61,15 @@
 { \
 printf(fmt "\n", ## __VA_ARGS__); \
 } while (false)
+
+#if __cplusplus < 201703L
+// cheat
+namespace std
+{
+template <typename T>
+T clamp(const T& v, const T& lo, const T& hi)
+{
+ return v < lo ? lo : (v > hi ? hi : v);
+}
+}
+#endif
\ No newline at end of file
diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp
new file mode 100644
index 0000000..70f2ede
--- /dev/null
+++ b/src/dolphin/MathUtil.cpp
@@ -0,0 +1,13 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "MathUtil.h"
+
+#include <numeric>
+
+// Calculate sum of a float list
+float MathFloatVectorSum(const std::vector<float>& Vec)
+{
+ return std::accumulate(Vec.begin(), Vec.end(), 0.0f);
+}
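The usage comment on BitCastPtr can be made concrete; a minimal sketch (not part of the patch):

#include <cstdint>
#include <cstdio>
#include "BitUtils.h"

int main()
{
    // Reinterpret the bytes of a float without the aliasing UB of a pointer cast.
    const uint32_t bits = Common::BitCast<uint32_t>(1.0f);
    std::printf("%08X\n", bits); // prints 3F800000

    // Aliasing-safe reads/writes through a raw byte buffer.
    unsigned char buffer[sizeof(float)] = {};
    Common::BitCastPtr<float>(buffer) = 2.5f;       // operator= memcpys the bytes in
    float back = Common::BitCastPtr<float>(buffer); // operator T() memcpys them out
    std::printf("%f\n", back); // prints 2.500000

    return 0;
}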
diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h
new file mode 100644
index 0000000..b1dbbae
--- /dev/null
+++ b/src/dolphin/MathUtil.h
@@ -0,0 +1,121 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "Compat.h"
+
+#include "../types.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace MathUtil
+{
+constexpr double TAU = 6.2831853071795865;
+constexpr double PI = TAU / 2;
+
+template <typename T>
+constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{}))
+{
+ return (T{} < val) - (val < T{});
+}
+
+template <typename T, typename F>
+constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a)
+{
+ return x + (y - x) * a;
+}
+
+template <typename T>
+constexpr bool IsPow2(T imm)
+{
+ return imm > 0 && (imm & (imm - 1)) == 0;
+}
+
+constexpr u32 NextPowerOf2(u32 value)
+{
+ --value;
+ value |= value >> 1;
+ value |= value >> 2;
+ value |= value >> 4;
+ value |= value >> 8;
+ value |= value >> 16;
+ ++value;
+
+ return value;
+}
+
+template <typename T>
+struct Rectangle
+{
+ T left{};
+ T top{};
+ T right{};
+ T bottom{};
+
+ constexpr Rectangle() = default;
+
+ constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom)
+ : left(theLeft), top(theTop), right(theRight), bottom(theBottom)
+ {
+ }
+
+ constexpr bool operator==(const Rectangle& r) const
+ {
+ return left == r.left && top == r.top && right == r.right && bottom == r.bottom;
+ }
+
+ T GetWidth() const { return abs(right - left); }
+ T GetHeight() const { return abs(bottom - top); }
+ // If the rectangle is in a coordinate system with a lower-left origin, use
+ // this Clamp.
+ void ClampLL(T x1, T y1, T x2, T y2)
+ {
+ left = std::clamp(left, x1, x2);
+ right = std::clamp(right, x1, x2);
+ top = std::clamp(top, y2, y1);
+ bottom = std::clamp(bottom, y2, y1);
+ }
+
+ // If the rectangle is in a coordinate system with an upper-left origin,
+ // use this Clamp.
+ void ClampUL(T x1, T y1, T x2, T y2)
+ {
+ left = std::clamp(left, x1, x2);
+ right = std::clamp(right, x1, x2);
+ top = std::clamp(top, y1, y2);
+ bottom = std::clamp(bottom, y1, y2);
+ }
+};
+
+} // namespace MathUtil
+
+float MathFloatVectorSum(const std::vector<float>&);
+
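A few spot checks of the helpers above (a sketch, not part of the patch; the 256x192 bounds are simply the DS screen used as sample values):

#include <cassert>
#include "MathUtil.h"

int main()
{
    static_assert(MathUtil::IsPow2(64), "64 is a power of two");
    static_assert(!MathUtil::IsPow2(96), "96 is not");
    static_assert(MathUtil::NextPowerOf2(96) == 128, "rounds up");
    static_assert(MathUtil::NextPowerOf2(128) == 128, "exact powers are kept");

    // Clamp a rectangle to upper-left-origin screen bounds.
    MathUtil::Rectangle<int> r(-10, -10, 300, 200);
    r.ClampUL(0, 0, 256, 192);
    assert(r == MathUtil::Rectangle<int>(0, 0, 256, 192));

    return 0;
}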
+// Rounds down. 0 -> undefined
+inline int IntLog2(u64 val)
+{
+#if defined(__GNUC__)
+ return 63 - __builtin_clzll(val);
+
+#elif defined(_MSC_VER)
+ unsigned long result = ULONG_MAX;
+ _BitScanReverse64(&result, val);
+ return result;
+
+#else
+ int result = -1;
+ while (val != 0)
+ {
+ val >>= 1;
+ ++result;
+ }
+ return result;
+#endif
+}
-- cgit v1.2.3

From 0d83e98e04548eb7b860df53be0a76e9ecb0809b Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 18:33:05 +0100
Subject: apply fixes for aarch64 linux by @nadiaholmquist

--- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 16 ++++++++++++++++ src/dolphin/Arm64Emitter.cpp | 2 +- src/dolphin/Arm64Emitter.h | 1 - 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'src')

diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
index 89d0029..b598ac8 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
@@ -8,6 +8,9 @@
 #include "../switch/compat_switch.h"
 extern char __start__;
+#else
+#include <sys/mman.h>
+#include <unistd.h>
 #endif
 #include
@@ -34,6 +37,9 @@
 template <> const int RegisterCache<Compiler, ARM64Reg>::NativeRegsAvailable = 8;
 const int JitMemSize = 16 * 1024 * 1024;
+#ifndef __SWITCH__
+u8 JitMem[JitMemSize];
+#endif
 void Compiler::MovePC()
 {
@@ -76,6 +82,16 @@ Compiler::Compiler()
 SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart);
 JitMemUseableSize = JitMemSize;
 Reset();
+#else
+ #else
+ u64 pageSize = sysconf(_SC_PAGE_SIZE);
+ u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize);
+ u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned;
+ mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
+
+ SetCodeBase(pageAligned, pageAligned);
+ JitMemUseableSize = alignedSize;
+ Reset();
 #endif
 for (int i = 0; i < 3; i++)
diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp
index dbcf425..dd2416b 100644
--- a/src/dolphin/Arm64Emitter.cpp
+++ b/src/dolphin/Arm64Emitter.cpp
@@ -8,9 +8,9 @@
 #include
 #include
+#include "Compat.h"
 #include "Align.h"
 #include "Arm64Emitter.h"
-#include "Assert.h"
 #include "BitUtils.h"
 #include "../types.h"
 #include "MathUtil.h"
diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h
index 4cb9ff7..3d9d4ba 100644
--- a/src/dolphin/Arm64Emitter.h
+++ b/src/dolphin/Arm64Emitter.h
@@ -8,7 +8,6 @@
 #include
 #include "ArmCommon.h"
-#include "Assert.h"
 #include "BitSet.h"
 #include "Compat.h"
-- cgit v1.2.3

From 3173e6e25d4456ec3ba26bed18d212bdf6cdfe81 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 18:50:16 +0100
Subject: re add error for unsupported JIT platforms

--- src/ARMJIT.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src')

diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 561fabb..208801e 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -8,8 +8,10 @@
 #include "ARMJIT_Internal.h"
 #if defined(__x86_64__)
 #include "ARMJIT_x64/ARMJIT_Compiler.h"
-#else
+#elif defined(__aarch64__)
 #include "ARMJIT_A64/ARMJIT_Compiler.h"
+#else
+#error "The current target platform doesn't have a JIT backend"
 #endif
 #include "ARMInterpreter_ALU.h"
-- cgit v1.2.3

From 272542972775368cec990561e419e84e34e09fe8 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Tue, 4 Feb 2020 19:07:30 +0100
Subject: fix LDM usermode for aarch64 as well

--- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 3 ++- src/ARMJIT_A64/ARMJIT_Compiler.h | 2 ++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'src')

diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp 
b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index b598ac8..d61cc9c 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -357,7 +357,8 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL, + F(Nop) }; #undef F #define F(x) &Compiler::T_Comp_##x diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 7e13507..5c9ef41 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -103,6 +103,8 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void Nop() {} + void A_Comp_ALUTriOp(); void A_Comp_ALUMovOp(); void A_Comp_ALUCmpOp(); diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index a5d0e3f..4fd8559 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -639,7 +639,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int reg = *it; - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (RegCache.Mapping[reg] != INVALID_REG) MOV(W3, MapReg(reg)); -- cgit v1.2.3 From d2acceb36754df349cf5d483155f71332a50000c Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:08:29 +0100 Subject: fixup for aarch64 JIT --- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 1 - src/ARMJIT_RegisterCache.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index d61cc9c..2033307 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -83,7 +83,6 @@ Compiler::Compiler() JitMemUseableSize = JitMemSize; Reset(); #else - #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index b894657..8460825 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -18,11 +18,15 @@ public: RegisterCache() {} - RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount, bool pcAllocatableAsSrc = false) : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) { for (int i = 0; i < 16; i++) Mapping[i] = (Reg)-1; + + PCAllocatableAsSrc = ~(pcAllocatableAsSrc + ? 
0 + : (1 << 15)); } void UnloadRegister(int reg) @@ -120,7 +124,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -184,6 +188,8 @@ public: u16 LoadedRegs = 0; u16 DirtyRegs = 0; + u16 PCAllocatableAsSrc = 0; + T* Compiler; FetchedInstr* Instrs; -- cgit v1.2.3 From 262dc7ad0003e31078c219ba3f295e018cf14a1e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:12:09 +0100 Subject: this it should work --- src/ARMJIT_RegisterCache.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 8460825..d4e5539 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,6 +95,20 @@ public: LiteralsLoaded = 0; } + BitSet32 GetPushRegs() + { + BitSet16 used; + for (int i = 0; i < InstrsCount; i++) + used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); + + BitSet32 res; + u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); + for (int i = 0; i < registersMax; i++) + res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); + + return res; + } + void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -111,7 +125,7 @@ public: for (int j = 0; j < 16; j++) ranking[j] = 0; for (int j = i; j < InstrsCount; j++) - { + {s BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); -- cgit v1.2.3 From c8b7a34383c2800845892fd3c1a06c09dab89349 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:21:08 +0100 Subject: git played a prank on me haha very funny --- src/ARMJIT_RegisterCache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index d4e5539..5e18e84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -125,7 +125,7 @@ public: for (int j = 0; j < 16; j++) ranking[j] = 0; for (int j = i; j < InstrsCount; j++) - {s + { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); @@ -212,4 +212,4 @@ public: } -#endif \ No newline at end of file +#endif -- cgit v1.2.3 From 225f90cced63c3e282ade81b0c4807f96fc96d59 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:34:26 +0100 Subject: the time of good commit names is long gone --- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 2033307..513c117 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -357,7 +357,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special NULL, NULL, NULL, NULL, NULL, NULL, NULL, - F(Nop) + &Compiler::Nop }; #undef F #define F(x) &Compiler::T_Comp_##x -- cgit v1.2.3 From 5ab56cef5fdc9f49cdf19ab719c8d63dd831081f 
Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:43:05 +0100 Subject: this mistake was phenomally stupid --- src/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) (limited to 'src') diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fce9e49..c34ba3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -54,10 +54,6 @@ add_library(core STATIC if (ENABLE_JIT) target_sources(core PRIVATE ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp ) -- cgit v1.2.3 From 3098c6a9a03dc284b780b19962d35bebde62ea35 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 6 Apr 2020 12:25:35 +0200 Subject: preparations for block linking --- src/ARMJIT_Internal.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index fb05f75..b968dcb 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -86,6 +86,14 @@ struct __attribute__((packed)) TinyVector Capacity = capacity; } + void SetLength(u16 length) + { + if (Capacity < length) + MakeCapacity(length); + + Length = length; + } + void Clear() { Length = 0; @@ -147,12 +155,7 @@ public: { NumInstrs = numInstrs; NumAddresses = numAddresses; - Data = new u32[numInstrs + numAddresses]; - } - - ~JitBlock() - { - delete[] Data; + Data.SetLength(numInstrs + numAddresses); } u32 StartAddr; @@ -160,13 +163,14 @@ public: u32 NumInstrs; u32 NumAddresses; + u32 NumLinks; JitBlockEntry EntryPoint; u32* Instrs() - { return Data; } + { return &Data[0]; } u32* AddressRanges() - { return Data + NumInstrs; } + { return &Data[NumInstrs]; } private: /* @@ -174,7 +178,7 @@ private: NumInstrs..<(NumLinks + NumInstrs) - pseudo physical addresses where the block is located (atleast one, the pseudo physical address of the block) */ - u32* Data; + TinyVector Data; }; // size should be 16 bytes because I'm to lazy to use mul and whatnot -- cgit v1.2.3 From 3ab9e4a4c91d482917782a7ac8f88beeab97a5b2 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 6 Apr 2020 12:31:20 +0200 Subject: arm64 fix itcm invalidation and ldm^/stm^ --- src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 50 ++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 4fd8559..6cf710b 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -109,12 +109,12 @@ void* Compiler::Gen_MemoryRoutine9(int size, bool store) ANDI2R(W3, W0, 0x7FFF & addressMask); if (store) { - LSR(W0, W3, 8); - ADDI2R(W0, W0, ExeMemRegionOffsets[exeMem_ITCM], W4); + ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4); + LSR(W5, W0, 9); MOVP2R(X4, CodeRanges); - ADD(X4, X4, X0, ArithOption(X0, ST_LSL, 4)); + ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4)); static_assert(sizeof(AddressRange) == 16); - LDR(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); + LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); FixupBranch null = CBZ(W4); ABI_PushRegisters({1, 3, 30}); QuickCallFunction(X4, InvalidateByAddr); @@ -211,34 +211,34 @@ void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) ANDI2R(W4, W0, ~3 & 0x7FFF); + ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5); if (store) { - LSR(W6, W4, 8); - ADDI2R(W6, W6, ExeMemRegionOffsets[exeMem_ITCM], W5); + LDR(X5, X1, ArithOption(X2, true)); + STR(W5, RCPU, X6); + } + else + { 
+ LDR(W5, RCPU, X6); + STR(X5, X1, ArithOption(X2, true)); + } + + if (store) + { + ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); + LSR(W6, W4, 9); MOVP2R(X5, CodeRanges); ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); static_assert(sizeof(AddressRange) == 16); - LDR(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); + LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); FixupBranch null = CBZ(W5); ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W6); + MOV(W0, W4); QuickCallFunction(X5, InvalidateByAddr); ABI_PopRegisters({0, 1, 2, 4, 30}); SetJumpTarget(null); } - ADDI2R(W4, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } - if (!preinc) ADD(W0, W0, 4); CBNZ(W2, loopStart); @@ -639,7 +639,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int reg = *it; - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { if (RegCache.Mapping[reg] != INVALID_REG) MOV(W3, MapReg(reg)); @@ -663,7 +663,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - + STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); i--; @@ -696,7 +696,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) + if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) UBFX(W0, RCPSR, 0, 5); int i = regsCount - 1; @@ -708,7 +708,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int reg = *it; - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { LDR(INDEX_UNSIGNED, W3, SP, i * 8); MOVI2R(W1, reg - 8); @@ -739,7 +739,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (*nextReg != 15) RegCache.DirtyRegs |= 1 << *nextReg; } - + LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); if (first == W3) -- cgit v1.2.3 From 1ad90cb334125090a0317efe522c36c6f285e556 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 16 Apr 2020 16:40:29 +0200 Subject: include more information in DataRegion --- src/ARM.h | 16 ++++++++-------- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_Internal.h | 7 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/CP15.cpp | 12 ++++++++++++ 5 files changed, 29 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/ARM.h b/src/ARM.h index 7ef1938..ccef265 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -320,7 +320,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = BusRead8(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -329,7 +329,7 @@ public: addr &= ~1; *val = BusRead16(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -338,7 +338,7 @@ public: addr &= ~3; *val = BusRead32(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -353,7 +353,7 @@ public: void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -362,7 +362,7 @@ public: addr &= ~1; BusWrite16(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 
20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -371,7 +371,7 @@ public: addr &= ~3; BusWrite32(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -402,7 +402,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) // mainRAM + if ((DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles += numC + numD; @@ -429,7 +429,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) + if ((DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) Cycles += numC + numD; diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 513c117..00fa436 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) + if ((CurInstr.DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index b968dcb..0d6add9 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -40,9 +40,9 @@ struct FetchedInstr u32 Instr; u32 Addr; - u8 CodeCycles; u8 DataCycles; - u8 DataRegion; + u16 CodeCycles; + u32 DataRegion; ARMInstrInfo::Info Info; }; @@ -195,6 +195,9 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; +extern u8 MemRegion9[0x80000]; +extern u8 MemRegion7[0x80000]; + void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5afe842..d69bdff 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -578,7 +578,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -623,7 +623,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) + if ((CurInstr.DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/CP15.cpp b/src/CP15.cpp index 8a9b31d..e168d7f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -729,6 +729,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { + DataRegion = addr >> 12; + if (addr < ITCMSize) { DataCycles = 1; @@ -748,6 +750,8 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { + DataRegion = addr >> 12; + addr &= ~1; if (addr < ITCMSize) @@ -769,6 +773,8 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { + DataRegion = addr >> 12; + addr &= ~3; if (addr < ITCMSize) @@ -811,6 +817,8 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { + DataRegion = addr >> 12; + if (addr < ITCMSize) { DataCycles = 1; @@ -833,6 +841,8 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { + DataRegion = addr >> 12; + addr &= ~1; if (addr < ITCMSize) @@ -857,6 +867,8 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { + DataRegion = addr >> 12; + addr &= ~3; if (addr < ITCMSize) -- cgit v1.2.3 From 1c07932b40e6e072c6ea66c49889860252e45186 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 6 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4870 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 896bb5c..3eac74d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -252,15 +252,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -273,9 +273,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -315,7 +315,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] 
= CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -328,7 +328,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -587,7 +587,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -627,14 +627,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -728,7 +730,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -768,14 +770,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp += Cycles; - Cycles = 0; + NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index ccef265..b71102a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -193,14 +193,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -211,9 +211,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -223,9 +223,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -387,13 +387,13 @@ public: void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. 
- Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
+ Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
 }
 void AddCycles_CDI()
@@ -405,21 +405,21 @@ public:
 if ((DataRegion >> 4) == 0x02) // mainRAM
 {
 if (CodeRegion == 0x02)
- Cycles += numC + numD;
+ Cycles -= numC + numD;
 else
 {
 numC++;
- Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+ Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
 }
 }
 else if (CodeRegion == 0x02)
 {
 numD++;
- Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+ Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
 }
 else
 {
- Cycles += numC + numD + 1;
+ Cycles -= numC + numD + 1;
 }
 }
@@ -432,17 +432,17 @@ public:
 if ((DataRegion >> 4) == 0x02)
 {
 if (CodeRegion == 0x02)
- Cycles += numC + numD;
+ Cycles -= numC + numD;
 else
- Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+ Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
 }
 else if (CodeRegion == 0x02)
 {
- Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+ Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
 }
 else
 {
- Cycles += numC + numD;
+ Cycles -= numC + numD;
 }
 }
};
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 208801e..cc8d4ce 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -2,6 +2,10 @@
 #include
 #include
+#include <unordered_map>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
 #include "Config.h"
@@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = {
 u32 AddrTranslate9[0x2000];
 u32 AddrTranslate7[0x4000];
-JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 AddressRange CodeRanges[ExeMemSpaceSize / 512];
-TinyVector<JitBlock*> JitBlocks;
-JitBlock* RestoreCandidates[0x1000] = {NULL};
+std::unordered_map<u32, JitBlock*> JitBlocks;
-u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
+template <typename K, typename V, int Size, V InvalidValue>
+struct UnreliableHashTable
 {
- return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
-}
+ struct Bucket
+ {
+ K KeyA, KeyB;
+ V ValA, ValB;
+ };
+
+ Bucket Table[Size];
+
+ void Reset()
+ {
+ for (int i = 0; i < Size; i++)
+ {
+ Table[i].ValA = Table[i].ValB = InvalidValue;
+ }
+ }
+
+ UnreliableHashTable()
+ {
+ Reset();
+ }
+
+ V Insert(K key, V value)
+ {
+ u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1);
+ Bucket* bucket = &Table[slot];
+
+ if (bucket->ValA == value || bucket->ValB == value)
+ {
+ return InvalidValue;
+ }
+ else if (bucket->ValA == InvalidValue)
+ {
+ bucket->KeyA = key;
+ bucket->ValA = value;
+ }
+ else if (bucket->ValB == InvalidValue)
+ {
+ bucket->KeyB = key;
+ bucket->ValB = value;
+ }
+ else
+ {
+ V prevVal = bucket->ValB;
+ bucket->KeyB = bucket->KeyA;
+ bucket->ValB = bucket->ValA;
+ bucket->KeyA = key;
+ bucket->ValA = value;
+ return prevVal;
+ }
+
+ return InvalidValue;
+ }
+
+ void Remove(K key)
+ {
+ u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1);
+ Bucket* bucket = &Table[slot];
+
+ if (bucket->KeyA == key && bucket->ValA != InvalidValue)
+ {
+ bucket->ValA = InvalidValue;
+ if (bucket->ValB != InvalidValue)
+ {
+ bucket->KeyA = bucket->KeyB;
+ bucket->ValA = bucket->ValB;
+ bucket->ValB = InvalidValue;
+ }
+ }
+ if (bucket->KeyB == key && bucket->ValB != InvalidValue)
+ bucket->ValB = InvalidValue;
+ }
+
+ V LookUp(K addr)
+ {
+ u32 slot = XXH3_64bits(&addr, 4) & (Size - 1);
+ Bucket* bucket = &Table[slot];
+
+ if (bucket->ValA != InvalidValue && bucket->KeyA == addr)
+ return bucket->ValA;
+ if (bucket->ValB != InvalidValue && bucket->KeyB == addr)
+ return bucket->ValB;
+
+ return InvalidValue;
+ }
+};
+
+UnreliableHashTable RestoreCandidates; 
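The table is deliberately lossy: each bucket holds two entries, a full bucket evicts its oldest entry (returned from Insert so the caller can delete it), and LookUp/Remove never touch more than one bucket. Note that Size must be a power of two, since slots are computed with & (Size - 1). A standalone sketch of those semantics, assuming the struct above and melonDS's u32 typedef are in scope (the 0x10 size here is made up for the example):

#include <cassert>

void SketchUnreliableHashTable()
{
    // u32 keys and values, 16 buckets; UINT32_MAX plays the InvalidValue role,
    // just as it does for FastBlockLookUp below.
    UnreliableHashTable<u32, u32, 0x10, UINT32_MAX> table;

    assert(table.Insert(0x02000000, 42) == UINT32_MAX); // stored, nothing evicted
    assert(table.LookUp(0x02000000) == 42);             // found again

    table.Remove(0x02000000);
    assert(table.LookUp(0x02000000) == UINT32_MAX);     // absent reads as InvalidValue
}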
+UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; 
             AddressRange* range = &CodeRanges[addr / 512];
             range->Blocks.Clear();
             if (range->TimesInvalidated + 1 > range->TimesInvalidated)
                 range->TimesInvalidated++;
         }
+        for (int i = 0; i < block->NumLinks(); i++)
+            compiler->UnlinkBlock(block->Links()[i]);
+        block->ResetLinks();
 
-        u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr);
-        if (RestoreCandidates[slot] && RestoreCandidates[slot] != block)
-            delete RestoreCandidates[slot];
-
-        RestoreCandidates[slot] = block;
+        JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block);
+        if (prevBlock)
+            delete prevBlock;
     }
-    JitBlocks.Clear();
+    JitBlocks.clear();
 }
 
 void ResetBlockCache()
 {
     printf("Resetting JIT block cache...\n");
-
-    memset(FastBlockAccess, 0, sizeof(FastBlockAccess));
-    for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++)
+
+    FastBlockLookUp.Reset();
+    for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++)
     {
-        if (RestoreCandidates[i])
+        if (RestoreCandidates.Table[i].ValA)
         {
-            delete RestoreCandidates[i];
-            RestoreCandidates[i] = NULL;
+            delete RestoreCandidates.Table[i].ValA;
+            RestoreCandidates.Table[i].ValA = NULL;
+        }
+        if (RestoreCandidates.Table[i].ValB)
+        {
+            delete RestoreCandidates.Table[i].ValB;
+            RestoreCandidates.Table[i].ValB = NULL;
+        }
     }
+    RestoreCandidates.Reset();
-    for (int i = 0; i < JitBlocks.Length; i++)
+    for (auto it : JitBlocks)
     {
-        JitBlock* block = JitBlocks[i];
+        JitBlock* block = it.second;
         for (int j = 0; j < block->NumAddresses; j++)
         {
             u32 addr = block->AddressRanges()[j];
@@ -788,11 +882,43 @@ void ResetBlockCache()
         }
         delete block;
     }
-    JitBlocks.Clear();
+    JitBlocks.clear();
 
     compiler->Reset();
 }
 
+JitBlockEntry LookUpBlockEntry(u32 addr)
+{
+    u32 entryOffset = FastBlockLookUp.LookUp(addr);
+    if (entryOffset != UINT32_MAX)
+        return compiler->AddEntryOffset(entryOffset);
+
+    auto block = JitBlocks.find(addr);
+    if (block != JitBlocks.end())
+    {
+        FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint));
+        return block->second->EntryPoint;
+    }
+    return NULL;
+}
+
+template <u32 Num>
+void LinkBlock(ARM* cpu, u32 codeOffset)
+{
+    u32 targetPseudoPhys = TranslateAddr<Num>(cpu->R[15] - ((cpu->CPSR&0x20)?2:4));
+    auto block = JitBlocks.find(targetPseudoPhys);
+    if (block == JitBlocks.end())
+    {
+        CompileBlock(cpu);
+        block = JitBlocks.find(targetPseudoPhys);
+    }
+
+    JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys);
+
+    block->second->AddLink(codeOffset);
+    compiler->LinkBlock(codeOffset, block->second->EntryPoint);
+}
+
 void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 {
     if (cpu->Num == 0)
@@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 
     return NULL;
-}
\ No newline at end of file
+}
+
+template void ARMJIT::LinkBlock<0>(ARM*, u32);
+template void ARMJIT::LinkBlock<1>(ARM*, u32);
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 09cc463..cab385f 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -32,7 +32,6 @@
 extern u32 AddrTranslate9[0x2000];
 extern u32 AddrTranslate7[0x4000];
 
 const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
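Block lookup is now two-tiered: FastBlockLookUp caches 32-bit offsets into the emitted-code region (SubEntryOffset/AddEntryOffset convert between offsets and entry pointers, which keeps the buckets small), and on a miss LookUpBlockEntry falls back to the authoritative JitBlocks map and refills the cache. A sketch of how a dispatcher can drive this; the ARM.cpp side of the change is not part of this excerpt, so the wrapper below is hypothetical:

// hypothetical dispatcher, for illustration only
void RunBlock(ARM* cpu)
{
    // R[15] runs ahead because of pipelining; step back one instruction
    // width (2 in THUMB mode, 4 in ARM mode) to get the address to execute
    u32 instrAddr = cpu->R[15] - ((cpu->CPSR & 0x20) ? 2 : 4);
    u32 pseudoPhys = cpu->Num == 0 ? ARMJIT::TranslateAddr<0>(instrAddr)
                                   : ARMJIT::TranslateAddr<1>(instrAddr);

    ARMJIT::JitBlockEntry entry = ARMJIT::LookUpBlockEntry(pseudoPhys);
    if (!entry)
    {
        ARMJIT::CompileBlock(cpu);                    // compiles at the current PC
        entry = ARMJIT::LookUpBlockEntry(pseudoPhys); // guaranteed to hit now
    }
    ARM_Dispatch(cpu, entry); // saves host registers, loads RCPU/RCPSR, jumps in
}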
-extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 
 template <u32 num>
 inline bool IsMapped(u32 addr)
@@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr)
     return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF);
 }
 
-template <u32 num>
-inline JitBlockEntry LookUpBlock(u32 addr)
-{
-    return FastBlockAccess[TranslateAddr<num>(addr) / 2];
-}
+JitBlockEntry LookUpBlockEntry(u32 addr);
+
 
 void Init();
 void DeInit();
@@ -73,4 +69,6 @@ void ResetBlockCache();
 
 }
 
+extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry);
+
 #endif
\ No newline at end of file
diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h
index 0d6add9..66d1808 100644
--- a/src/ARMJIT_Internal.h
+++ b/src/ARMJIT_Internal.h
@@ -15,7 +15,8 @@ enum
 {
     branch_IdleBranch = 1 << 0,
     branch_FollowCondTaken = 1 << 1,
-    branch_FollowCondNotTaken = 1 << 2
+    branch_FollowCondNotTaken = 1 << 2,
+    branch_StaticTarget = 1 << 3,
 };
 
 struct FetchedInstr
@@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector
         assert(capacity > Capacity);
         T* newMem = new T[capacity];
         if (Data != NULL)
-            memcpy(newMem, Data, sizeof(Data) * Length);
+            memcpy(newMem, Data, sizeof(T) * Length);
 
         T* oldData = Data;
         Data = newMem;
@@ -163,7 +164,6 @@ public:
 
     u32 NumInstrs;
     u32 NumAddresses;
-    u32 NumLinks;
 
     JitBlockEntry EntryPoint;
 
@@ -171,6 +171,21 @@ public:
     { return &Data[0]; }
     u32* AddressRanges()
     { return &Data[NumInstrs]; }
+    u32* Links()
+    { return &Data[NumInstrs + NumAddresses]; }
+
+    u32 NumLinks()
+    { return Data.Length - NumInstrs - NumAddresses; }
+
+    void AddLink(u32 link)
+    {
+        Data.Add(link);
+    }
+
+    void ResetLinks()
+    {
+        Data.SetLength(NumInstrs + NumAddresses);
+    }
 
 private:
     /*
@@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000];
 
 void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size);
 
+template <u32 Num>
+void LinkBlock(ARM* cpu, u32 codeOffset);
+
 }
 
 #endif
\ No newline at end of file
diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index e02865d..cac590a 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
     if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles)
         ConstantCycles += cycles;
     else
-        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
+        SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
 }
 
 void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR)
@@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR)
     IrregularCycles = true;
 
     BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00);
-    bool previouslyDirty = CPSRDirty;
+    bool cpsrDirty = CPSRDirty;
     SaveCPSR();
 
     if (restoreCPSR)
@@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR)
             LoadReg(reg, RegCache.Mapping[reg]);
     }
 
-    if (previouslyDirty)
-        LoadCPSR();
-    CPSRDirty = previouslyDirty;
+    LoadCPSR();
+    // in case this instruction is skipped
+    if (CurInstr.Cond() < 0xE)
+        CPSRDirty = cpsrDirty;
 }
 
 void Compiler::A_Comp_BranchImm()
@@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND()
     s32 offset = (s32)(CurInstr.Instr << 24) >> 23;
     Comp_JumpTo(R15 + offset + 1, true);
 
-    Comp_SpecialBranchBehaviour();
+    Comp_SpecialBranchBehaviour(true);
 
    FixupBranch skipFailed = J();
    SetJumpTarget(skipExecute);
 
-    if (CurInstr.BranchFlags & branch_FollowCondTaken)
-    {
-        RegCache.PrepareExit();
-        SaveCPSR(false);
-
-        MOV(32, R(RAX), Imm32(ConstantCycles));
-        ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
-        RET();
-    }
+
Comp_SpecialBranchBehaviour(false); Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
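+        // less than ~32 KB of emitted-code space left: wipe the whole block
+        // cache and start emitting from ResetStart again (the threshold
+        // itself is an admitted guess, per the comment above)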
ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx 
+#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c34ba3b..a0c3a36 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,9 +49,12 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -68,7 +71,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index 07b1e3e..e69319b 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -38,10 +38,10 @@ char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,7 +58,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 1fcd9bb..d546524 100644 --- a/src/Config.h +++ b/src/Config.h @@ -52,10 +52,10 @@ extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include +# undef inline +# endif +#elif defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. 
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include +# else +# include +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. 
*/ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +/* + * Calculates a 32-bit to 64-bit long multiply. + * + * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) + * { + * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); + * } + */ +#if defined(_MSC_VER) && defined(_M_IX86) +# include +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/* + * Calculates a 64->128-bit long multiply. + * + * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + +#else + /* + * Portable scalar method. 
Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/* + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * We don't need to (or want to) mix as much as XXH64. + * + * Short hashes are more evenly distributed, so it isn't necessary. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. 
+ * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + 
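+    /* dispatch by length: 9..16 and 4..8 byte inputs each get a dedicated
+     * mixer, 1..3 bytes combine the bytes into a single 32-bit word, and
+     * the empty input mixes the seed with two words of the secret */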
XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + __asm__ ("" : "+r" (seed64)); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
*/ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); + + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. 
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
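+ *
+ * As in the scalar variant at the end of this function, each 64-bit lane
+ * is scrambled as (sketch; `acc` and `key` name one lane of the
+ * accumulator and of the secret):
+ *
+ *     acc = ((acc ^ (acc >> 47)) ^ key) * (xxh_u64)PRIME32_1;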
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
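+ * Stripe n pairs input + n*STRIPE_LEN with secret + n*XXH_SECRET_CONSUME_RATE,
+ * so consecutive stripes read shifted (and typically overlapping) windows
+ * of the secret.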
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
+ * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); +} + +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_64bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->acc[0] = PRIME32_3; + statePtr->acc[1] = PRIME64_1; + statePtr->acc[2] = PRIME64_2; + statePtr->acc[3] = PRIME64_3; + statePtr->acc[4] = PRIME64_4; + statePtr->acc[5] = PRIME32_2; + statePtr->acc[6] = PRIME64_5; + statePtr->acc[7] = PRIME32_1; + statePtr->seed = seed; + XXH_ASSERT(secret != NULL); + statePtr->secret = secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if 
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
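+ *
+ * Worked example (sketch, assuming the usual 256-byte internal buffer and
+ * 64-byte stripes): with bufferedSize == 40 and len == 300, the first 216
+ * input bytes top up the buffer, the full buffer is consumed as
+ * XXH3_INTERNALBUFFER_STRIPES == 4 stripes, and the remaining 84 bytes
+ * take the paths below.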
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
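+ *
+ * That multiply decomposes into 64-bit pieces (sketch):
+ *
+ *     (hi:lo) * c == XXH_mult64to128(lo, c) + ((hi * c) << 64)
+ *
+ * which is exactly how h128 is assembled at the end of this function.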
+ */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
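+ * (Each half XORs in the raw 64-bit words of the other input, so even if
+ * one XXH3_mix16B multiply degenerates to zero, the input still reaches
+ * the accumulator.)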
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* === XXH3 128-bit streaming === */
+
+/*
+ * All the functions are actually the same as for the 64-bit streaming variant.
+ * The only difference is the finalization routine.
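+ *
+ * Typical use (sketch):
+ *
+ *     XXH3_state_t* const st = XXH3_createState();
+ *     XXH3_128bits_reset_withSeed(st, seed);
+ *     XXH3_128bits_update(st, data, size);   // repeat per input chunk
+ *     XXH128_hash_t const h128 = XXH3_128bits_digest(st);
+ *     XXH3_freeState(st);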
+ */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         state->secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         state->secret + state->secretLimit + STRIPE_LEN
+                                                       - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#define XXH_IMPLEMENTATION   /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h
new file mode 100644
index 0000000..67a5887
--- /dev/null
+++ b/src/xxhash/xxhash.h
@@ -0,0 +1,1965 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. 
+ * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
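+ *
+ * Usage sketch (MYLIB_ is a placeholder prefix):
+ *
+ *     #define XXH_NAMESPACE MYLIB_
+ *     #include "xxhash.h"
+ *
+ * The library then exports MYLIB_XXH64() etc., while calling code keeps
+ * writing XXH64() thanks to the #defines below.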
+ */
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 7
+#define XXH_VERSION_RELEASE 4
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h> /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+* 32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint32_t XXH32_hash_t;
+#else
+# include <limits.h>
+# if UINT_MAX == 0xFFFFFFFFUL
+ typedef unsigned int XXH32_hash_t;
+# else
+# if ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
+# else
+# error "unsupported platform: need a 32-bit type"
+# endif
+# endif
+#endif
+
+/*!
+ * XXH32():
+ * Calculate the 32-bit hash of the sequence of "length" bytes stored at memory address "input".
+ * The memory between input & input+length must be valid (allocated and read-accessible).
+ * "seed" can be used to alter the result predictably.
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/******* Streaming *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
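A one-shot call following the XXH32() contract documented above, as a minimal sketch (the record layout and seed value are arbitrary):

#include "xxhash.h"

/* Hash an in-memory record; seed 0 is the conventional default. */
static XXH32_hash_t record_hash(const void* rec, size_t rec_len)
{
    return XXH32(rec, rec_len, 0);
}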
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ */
+
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/******* Canonical representation *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of byte-level ordering, since little-
+ * and big-endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint64_t XXH64_hash_t;
+#else
+ /* the following type must have a width of 64-bit */
+ typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*!
+ * XXH64():
+ * Returns the 64-bit hash of the sequence of length @length stored at memory
+ * address @input.
+ * @seed can be used to alter the result predictably.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
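The createState/reset/update/digest/freeState sequence described above, sketched as a chunked file hasher; the FILE source and 4 KiB chunk size are illustrative:

#include <stdio.h>
#include "xxhash.h"

static int xxh32_file(FILE* f, XXH32_hash_t* out)
{
    XXH32_state_t* st = XXH32_createState();
    char buf[4096];
    size_t n;
    if (st == NULL || XXH32_reset(st, 0) != XXH_OK) { XXH32_freeState(st); return -1; }
    while ((n = fread(buf, 1, sizeof buf, f)) > 0)
        if (XXH32_update(st, buf, n) != XXH_OK) { XXH32_freeState(st); return -1; }
    *out = XXH32_digest(st);   /* non-destructive: updates may continue afterwards */
    XXH32_freeState(st);
    return 0;
}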
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
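The state structures above exist so a state can be allocated without the heap; a sketch of that pattern under the stated constraint that members are never accessed directly:

#define XXH_STATIC_LINKING_ONLY   /* opt in to the unstable declarations above */
#include "xxhash.h"

static XXH64_hash_t hash_no_heap(const void* data, size_t len)
{
    XXH64_state_t st;           /* stack allocation, no XXH64_createState() */
    XXH64_reset(&st, 0);        /* reset fully initializes the structure */
    XXH64_update(&st, data, len);
    return XXH64_digest(&st);   /* the members themselves are never touched */
}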
+ *
+ * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
+ * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
+ * explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ *
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The 128-bit version adds additional strength, but it is slightly slower.
+ *
+ * The XXH3 algorithm is still in development.
+ * The results it produces may still change in future versions.
+ *
+ * Results produced by v0.7.x are not comparable with results from v0.7.y.
+ * However, the API is completely stable, and it can safely be used for
+ * ephemeral data (local sessions).
+ *
+ * Avoid storing values in long-term storage until the algorithm is finalized.
+ *
+ * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if
+ * everything remains fine, its current format will be "frozen" and become the
+ * final one, after which the return values of XXH3 and XXH128 will no longer
+ * change in future versions.
+ *
+ * XXH3's return values will be officially finalized upon reaching v0.8.0.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+
+#ifdef XXH_NAMESPACE
+# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+
+# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+
+# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#endif
+
+/* XXH3_64bits():
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional
+ * collision.
+ * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid trivial sequences, such as repeating sequences and especially '\0',
+ * as this can cancel itself out.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/*
+ * XXH3_64bits_withSeed():
+ * This variant generates a custom secret on the fly based on the default
+ * secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
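A sketch of the secret and seed variants above; the seed constant is arbitrary, and a real secret must be at least XXH3_SECRET_SIZE_MIN bytes drawn from a good random source:

#include "xxhash.h"

static XXH64_hash_t with_seed(const void* p, size_t n)
{
    return XXH3_64bits_withSeed(p, n, 0x1234ABCDULL);   /* arbitrary example seed */
}

static XXH64_hash_t with_secret(const void* p, size_t n,
                                const unsigned char secret[XXH3_SECRET_SIZE_MIN])
{
    /* secret quality is the caller's responsibility, per the warning above */
    return XXH3_64bits_withSecret(p, n, secret, XXH3_SECRET_SIZE_MIN);
}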
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
+# include <stdalign.h>
+# define XXH_ALIGN(n) alignas(n)
+#elif defined(__GNUC__)
+# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+# define XXH_ALIGN(n) __declspec(align(n))
+#else
+# define XXH_ALIGN(n) /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ && defined(__GNUC__)
+# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+    XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+    /* used to store a custom secret generated from the seed. Makes state larger.
+     * Design might change */
+    XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+    XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+    XXH32_hash_t bufferedSize;
+    XXH32_hash_t nbStripesPerBlock;
+    XXH32_hash_t nbStripesSoFar;
+    XXH32_hash_t secretLimit;
+    XXH32_hash_t reserved32;
+    XXH32_hash_t reserved32_2;
+    XXH64_hash_t totalLen;
+    XXH64_hash_t seed;
+    XXH64_hash_t reserved64;
+    /* note: there is some padding after due to alignment on 64 bytes */
+    const unsigned char* secret;
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever possible.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with the default parameters.
+ * The result will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * The digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, and must outlive the hash streaming session, so
+ * be careful when using stack arrays.
+ * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
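A seeded XXH3 streaming session following the reset semantics above; the chunk list is illustrative, and the same state object could be reused for another session via another reset:

#include "xxhash.h"

static XXH64_hash_t xxh3_stream(const void* const* chunk, const size_t* len, size_t count)
{
    XXH3_state_t* st = XXH3_createState();
    XXH64_hash_t h = 0;
    size_t i;
    if (st == NULL) return 0;   /* 0 used as an allocation-failure sentinel here */
    if (XXH3_64bits_reset_withSeed(st, 42) == XXH_OK) {   /* arbitrary seed */
        for (i = 0; i < count; i++)
            XXH3_64bits_update(st, chunk[i], len[i]);
        h = XXH3_64bits_digest(st);
    }
    XXH3_freeState(st);
    return h;
}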
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
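Because XXH128_cmp above matches the comparator signature expected by qsort() and bsearch(), sorting hashes needs no glue code; a sketch with a hypothetical hash array:

#include <stdlib.h>
#include "xxhash.h"

static void sort_hashes(XXH128_hash_t* hashes, size_t count)
{
    /* imposes a deterministic total order over the 16-byte hash values */
    qsort(hashes, count, sizeof hashes[0], XXH128_cmp);
}

static int same_payload(const void* a, size_t la, const void* b, size_t lb)
{
    return XXH128_isEqual(XXH3_128bits(a, la), XXH3_128bits(b, lb));
}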
+ *
+ * return: >0 if *h128_1 > *h128_2
+ * <0 if *h128_1 < *h128_2
+ * =0 if *h128_1 == *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/******* Canonical representation *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+# define XXH_IMPLEMENTATION
+#endif
+
+#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be found in xxhash.c.
+ *
+ * However, code inlining requires the implementation to be visible to the
+ * compiler, usually within the header.
+ *
+ * As a workaround, xxhash.c used to be included within xxhash.h. This caused
+ * some issues with some build systems, especially ones which treat .c files
+ * as source files.
+ *
+ * Therefore, the implementation is now directly integrated within xxhash.h.
+ * Another small advantage is that xxhash.c is no longer needed in /include.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+ || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+# define XXH_IMPLEM_13a8737387
+
+/* *************************************
+* Tuning parameters
+***************************************/
+/*!
+ * XXH_FORCE_MEMORY_ACCESS:
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selecting a different access method for improved
+ * performance.
+ * Method 0 (default):
+ * Use `memcpy()`. Safe and portable.
+ * Method 1:
+ * `__attribute__((packed))` statement. It depends on compiler extensions
+ * and is therefore not portable.
+ * This method is safe if your compiler supports it, and *generally* as
+ * fast or faster than `memcpy`.
+ * Method 2:
+ * Direct access via cast. This method doesn't depend on the compiler but
+ * violates the C standard.
+ * It can generate buggy code on targets which do not support unaligned
+ * memory accesses.
+ * But in some circumstances, it's the only known way to get the most
+ * performance (e.g. GCC + ARMv6)
+ * Method 3:
+ * Byteshift. This can generate the best code on old compilers which don't
+ * inline small `memcpy()` calls, and it might also be faster on big-endian
+ * systems which lack a native byteswap instruction.
+ * See https://stackoverflow.com/a/32095106/646947 for details.
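This tuning macro is meant to be set from the build rather than edited in place; a hedged sketch of pinning method 1 through a wrapper header (the compiler invocation in the comment is illustrative):

/* or equivalently: cc -O3 -DXXH_FORCE_MEMORY_ACCESS=1 -c xxhash.c */
#define XXH_FORCE_MEMORY_ACCESS 1   /* packed-attribute loads; GCC/ICC style compilers */
#include "xxhash.h"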
+ * Prefer these methods in priority order (0 > 1 > 2 > 3)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+# define XXH_FORCE_MEMORY_ACCESS 2
+# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+ (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+# define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+/*!
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+ * triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is NULL, the result is the same as for a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
+# define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!
+ * XXH_FORCE_ALIGN_CHECK:
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means: check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned or when alignment
+ * doesn't matter for performance.
+ *
+ * This option does not affect XXH3.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_FORCE_ALIGN_CHECK 0
+# else
+# define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+/*!
+ * XXH_NO_INLINE_HINTS:
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary, which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control over whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#ifndef XXH_NO_INLINE_HINTS
+# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+ || defined(__NO_INLINE__) /* -O0, -fno-inline */
+# define XXH_NO_INLINE_HINTS 1
+# else
+# define XXH_NO_INLINE_HINTS 0
+# endif
+#endif
+
+/*!
+ * XXH_REROLL:
+ * Whether to reroll XXH32_finalize and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang.
+ */
+#ifndef XXH_REROLL
+# if defined(__OPTIMIZE_SIZE__)
+# define XXH_REROLL 1
+# else
+# define XXH_REROLL 0
+# endif
+#endif
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/*!
+ * Modify the local functions below should you wish to use some other memory
+ * routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void XXH_free(void* p) { free(p); }
+
+/*!
and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h> /* ULLONG_MAX */
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS /* disable inlining hints */
+# define XXH_FORCE_INLINE static
+# define XXH_NO_INLINE static
+#elif defined(_MSC_VER) /* Visual Studio */
+# define XXH_FORCE_INLINE static __forceinline
+# define XXH_NO_INLINE static __declspec(noinline)
+#else
+# if defined (__cplusplus) \
+ || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# ifdef __GNUC__
+# define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+# define XXH_NO_INLINE static __attribute__((noinline))
+# else
+# define XXH_FORCE_INLINE static inline
+# define XXH_NO_INLINE static
+# endif
+# else
+# define XXH_FORCE_INLINE static
+# define XXH_NO_INLINE static
+# endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+* Debug
+***************************************/
+/*
+ * DEBUGLEVEL is expected to be defined externally, typically via the compiler's
+ * command line options. The value must be a number.
+ */
+#ifndef DEBUGLEVEL
+# define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+# include <assert.h> /* note: can still be disabled with NDEBUG */
+# define XXH_ASSERT(c) assert(c)
+#else
+# define XXH_ASSERT(c) ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t xxh_u8;
+#else
+ typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* *** Memory access *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* *** Endianness *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/*!
+ * XXH_CPU_LITTLE_ENDIAN:
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+* Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+* 32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * PRIME32_2;
+    acc = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+     * loop (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even on Sandy/Ivy Bridge,
+     *   where pmulld was fastest, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *   movdqa tmp, v // not required with VEX encoding
+     *   pslld tmp, 13 // tmp <<= 13
+     *   psrld v, 19 // v >>= 19
+     *   por v, tmp // v |= tmp
+     *   compared to one for scalar:
+     *   roll v, 13 // reliably fast across the board
+     *   shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * How this hack works:
+     * __asm__("" // Declare an assembly block but don't declare any instructions
+     * : // However, as an Input/Output Operand,
+     * "+r" // constrain a read/write operand (+) as a general purpose register (r).
+     * (acc) // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize.
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
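Two properties of the routines above are easy to self-check: a streamed hash equals the one-shot hash regardless of how the input is split, and the canonical encoding round-trips losslessly. A minimal sketch, with an arbitrary input and split point:

#include <assert.h>
#include "xxhash.h"

static void check_invariants(void)
{
    const char data[] = "xxhash sanity";
    const size_t len = sizeof(data) - 1;
    XXH32_hash_t one = XXH32(data, len, 7);   /* arbitrary seed */

    XXH32_state_t* st = XXH32_createState();
    assert(st != NULL);
    XXH32_reset(st, 7);
    XXH32_update(st, data, 5);                /* arbitrary split */
    XXH32_update(st, data + 5, len - 5);
    assert(XXH32_digest(st) == one);
    XXH32_freeState(st);

    XXH32_canonical_t canon;
    XXH32_canonicalFromHash(&canon, one);     /* big-endian on every platform */
    assert(XXH32_hashFromCanonical(&canon) == one);
}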
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From 1c98cefcee727a2ff621bfe9924cf6cd91770379 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 14:42:37 +0200 Subject: compile UMULLs and some fixes --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 33 +++++++++++++++++++++++++-------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 3 ++- 4 files changed, 30 insertions(+), 12 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 14c223b..43b94b6 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -301,10 +301,11 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } -void Compiler::A_Comp_SMULL_SMLAL() +void Compiler::A_Comp_Mul_Long() { bool S = CurInstr.Instr & (1 << 20); bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); OpArg rd = MapReg(CurInstr.A_Reg(16)); OpArg rm = MapReg(CurInstr.A_Reg(0)); OpArg rs = MapReg(CurInstr.A_Reg(8)); @@ -318,18 +319,34 @@ void Compiler::A_Comp_SMULL_SMLAL() MOV(32, R(RSCRATCH3), rs); TEST(32, R(RSCRATCH3), R(RSCRATCH3)); FixupBranch zeroBSR = J_CC(CC_Z); - BSR(32, RSCRATCH2, R(RSCRATCH3)); - NOT(32, R(RSCRATCH3)); - BSR(32, RSCRATCH, R(RSCRATCH3)); - CMP(32, R(RSCRATCH2), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + if (sign) + { + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + } + else + { + BSR(32, RSCRATCH, R(RSCRATCH3)); + } + SHR(32, R(RSCRATCH), Imm8(3)); SetJumpTarget(zeroBSR); // fortunately that's even right Comp_AddCycles_CI(RSCRATCH, 2); } - MOVSX(64, 32, RSCRATCH2, rm); - MOVSX(64, 32, RSCRATCH3, rs); + if (sign) + { + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + } + else + { + MOV(32, R(RSCRATCH2), rm); + MOV(32, R(RSCRATCH3), rs); + } if (add) { MOV(32, R(RSCRATCH), rd); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index be3709e..1b2d312 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -300,7 +300,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), // Mul - F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff F(A_Comp_CLZ), NULL, NULL, NULL, NULL, // STR @@ -628,7 +628,7 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, 
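
Background for the BSR sequence in A_Comp_Mul_Long above: ARM long multiplies terminate early depending on how many significant bytes the multiplier has, and for the signed variants a leading run of ones counts the same as a leading run of zeros. A rough C rendering of the emitted code, with compiler intrinsics standing in for the BSR/CMOV pair (the rs == 0 case corresponds to the J_CC(CC_Z) skip):

#include <cstdint>

static int MulLongExtraCycles(uint32_t rs, bool sign)
{
    if (rs == 0)
        return 0;                                       // zeroBSR path
    int top;
    if (sign)
    {
        int highestSet   = 31 - __builtin_clz(rs);      // BSR rs
        int highestClear = ~rs ? 31 - __builtin_clz(~rs) : 0; // BSR ~rs
        top = highestSet < highestClear ? highestSet : highestClear; // CMOVcc CC_L
    }
    else
    {
        top = 31 - __builtin_clz(rs);                   // unsigned: zeros only
    }
    return top >> 3;   // SHR(..., Imm8(3)): whole bytes above the lowest
}

The result feeds Comp_AddCycles_CI(RSCRATCH, 2), i.e. a constant of two internal cycles plus the computed per-byte amount.
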
int add) } else { - ConstantCycles += i + cycles; + ConstantCycles += cycles; SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index b428c33..a448b6d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -89,7 +89,7 @@ public: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); - void A_Comp_SMULL_SMLAL(); + void A_Comp_Mul_Long(); void A_Comp_CLZ(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 4cafc1c..7f6fa53 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -423,7 +423,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (flags & memop_SubtractOffset) { - MOV(32, R(finalAddr), rnMapped); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); if (!offset.IsZero()) SUB(32, R(finalAddr), offset); } -- cgit v1.2.3 From c2dd6a186da3e7e705b970c1fcc6768d00dba08e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 19:35:40 +0200 Subject: implement msr and mrs for the x64 JIT --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 127 ++++++++++++++++++++++++++++++++++++- src/ARMJIT_x64/ARMJIT_Compiler.h | 3 + src/ARM_InstrInfo.cpp | 4 ++ 4 files changed, 134 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index cc8d4ce..46f71f1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -824,7 +824,7 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); for (auto it : JitBlocks) { JitBlock* block = it.second; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 1b2d312..52a16dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -38,6 +38,131 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(ReadBanked); + MOV(32, rd, R(ABI_PARAM3)); + } + else + MOV(32, rd, R(RCPSR)); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + OpArg val = CurInstr.Instr & (1 << 25) + ? 
Imm32(ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))) + : MapReg(CurInstr.A_Reg(0)); + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(ReadBanked); + + MOV(32, R(RSCRATCH2), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH3), Imm32(0xFFFFFFFF)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_NE); + AND(32, R(RSCRATCH2), Imm32(mask)); + + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(ABI_PARAM3), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(ABI_PARAM3), R(RSCRATCH2)); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + AND(32, R(RCPSR), Imm32(~mask)); + if (val.IsImm()) + { + MOV(32, R(RSCRATCH), val); + AND(32, R(RSCRATCH), Imm32(mask)); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + OR(32, R(RCPSR), Imm32(val.Imm32() & mask)); + } + } + else + { + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + AND(32, R(RSCRATCH3), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_E); + + MOV(32, R(RSCRATCH3), R(RCPSR)); + + // I need you ANDN + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(RCPSR), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RCPSR), R(RSCRATCH2)); + + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + if (Thumb || CurInstr.Cond() >= 0xE) + RegCache.Flush(); + else + { + // the ugly way... 
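
To keep the bit-twiddling above readable, here is the same decode logic in plain C (ad hoc names; ROR is the usual 32-bit rotate right, as used elsewhere in this code): the immediate form rotates an 8-bit constant, the four field bits select CPSR bytes, user mode is restricted to the flags byte, and the CPSR path additionally clears bit 5 of the mask so MSR can never flip the Thumb bit.

static uint32_t MSROperand(uint32_t instr, const uint32_t* r)
{
    if (instr & (1 << 25))                              // immediate form
        return ROR(instr & 0xFF, (instr >> 7) & 0x1E);  // imm8 ror (rot4 * 2)
    return r[instr & 0xF];                              // register form
}

static uint32_t MSRWriteMask(uint32_t instr, uint32_t cpsrMode, bool spsr)
{
    uint32_t mask = 0;
    if (instr & (1 << 16)) mask |= 0x000000FF;  // c: control byte
    if (instr & (1 << 17)) mask |= 0x0000FF00;  // x: extension byte
    if (instr & (1 << 18)) mask |= 0x00FF0000;  // s: status byte
    if (instr & (1 << 19)) mask |= 0xFF000000;  // f: flags byte
    if (!spsr)
        mask &= 0xFFFFFFDF;                     // never write the T bit
    if ((cpsrMode & 0x1F) == 0x10)              // user mode: flags only
        mask &= 0xFFFFFF00;
    return mask;
}
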
+ // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + + MOV(32, R(ABI_PARAM3), R(RCPSR)); + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((void*)&ARM::UpdateMode); + + if (!Thumb && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + } + } +} + /* We'll repurpose this .bss memory @@ -328,7 +453,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(A_Comp_MSR), F(A_Comp_MSR), F(A_Comp_MRS), NULL, NULL, NULL, F(Nop) }; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a448b6d..2230eb8 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -100,6 +100,9 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b884773..28362d9 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -427,6 +427,10 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = ak_UNK; } } + if (res.Kind == ak_MRS && !(instr & (1 << 22))) + res.ReadFlags |= flag_N | flag_Z | flag_C | flag_V; + if ((res.Kind == ak_MSR_IMM || res.Kind == ak_MSR_REG) && instr & (1 << 19)) + res.WriteFlags |= flag_N | flag_Z | flag_C | flag_V; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); -- cgit v1.2.3 From dc86bac83d865ebc1b9a520791b831f6799fe87c Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 16:17:16 +0200 Subject: hopefully fix stack handling for linux --- src/ARMJIT_x64/ARMJIT_Linkage.s | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s index dbbb024..0a84df0 100644 --- a/src/ARMJIT_x64/ARMJIT_Linkage.s +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -44,6 +44,8 @@ ARM_Dispatch: #ifdef WIN64 sub rsp, 0x28 +#else + sub rsp, 0x8 #endif mov RCPU, ARG1_REG64 mov RCPSR, [RCPU + ARM_CPSR_offset] @@ -58,6 +60,8 @@ ARM_Ret: #ifdef WIN64 add rsp, 0x28 +#else + add rsp, 0x8 #endif pop rbp -- cgit v1.2.3 From 6d217e1010f3aed73a86d5dff2f3f46a3ca60cb5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 16:27:26 +0200 Subject: fix build with JIT disabled and set default JIT maxblock size to 32 --- src/ARM.cpp | 2 ++ src/CMakeLists.txt | 2 +- src/CP15.cpp | 4 ++++ src/Config.cpp | 4 ++-- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 3eac74d..6b8df30 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -173,6 +173,7 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); +#ifdef JIT_ENABLED if (!file->Saving && Config::JIT_Enable) { // hack, the JIT doesn't really pipeline @@ -180,6 +181,7 @@ void ARM::DoSavestate(Savestate* file) // loaded while running the interpreter FillPipeline(); } +#endif file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a0c3a36..0029e96 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,7 +11,6 @@ add_library(core STATIC ARMInterpreter_ALU.cpp 
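
On the ARMJIT_Linkage.s fix above: both x64 ABIs require rsp to be 16-byte aligned at every call instruction, and Win64 additionally reserves 32 bytes of shadow space for callees. Entering the dispatcher through a call leaves rsp 8 short of alignment, and an odd total of pushed 8-byte slots keeps it that way, hence the small sub on the non-Windows side. The arithmetic, with an illustrative slot count rather than one read off the real prologue:

#include <cstdint>

// slotsOnStack = return address + callee-saved registers pushed so far
constexpr uint32_t StackAdjust(uint32_t slotsOnStack, bool win64)
{
    return ((slotsOnStack * 8 % 16) ? 8u : 0u)   // restore rsp % 16 == 0
         + (win64 ? 0x20u : 0u);                 // Win64 shadow space
}

// e.g. return address plus six pushed registers = 7 slots:
static_assert(StackAdjust(7, true)  == 0x28, "matches the WIN64 branch");
static_assert(StackAdjust(7, false) == 0x08, "matches the new branch");
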
ARMInterpreter_Branch.cpp ARMInterpreter_LoadStore.cpp - ARM_InstrInfo.cpp Config.cpp CP15.cpp CRC32.cpp @@ -57,6 +56,7 @@ if (ENABLE_JIT) target_sources(core PRIVATE ARMJIT.cpp + ARM_InstrInfo.cpp dolphin/CommonFuncs.cpp ) diff --git a/src/CP15.cpp b/src/CP15.cpp index e168d7f..ff8531c 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -562,11 +562,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: +#ifdef JIT_ENABLED ARMJIT::InvalidateAll(); +#endif ICacheInvalidateAll(); return; case 0x751: +#ifdef JIT_ENABLED ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); +#endif ICacheInvalidateByAddr(val); return; case 0x752: diff --git a/src/Config.cpp b/src/Config.cpp index e69319b..c0ec4ec 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -39,7 +39,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED int JIT_Enable = false; -int JIT_MaxBlockSize = 12; +int JIT_MaxBlockSize = 32; int JIT_BrancheOptimisations = 2; int JIT_LiteralOptimisations = true; #endif @@ -57,7 +57,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, - {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif -- cgit v1.2.3 From 5a3607bc688b42cc1da886bd2afc58d7aa4733be Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 20:47:36 +0200 Subject: don't use param registers for ReadBanked/WriteBanked should fix linux build --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 64 ++++++++++++++++++------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 +++++----- 3 files changed, 40 insertions(+), 41 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 52a16dc..8d20425 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -48,10 +48,10 @@ void Compiler::A_Comp_MRS() { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); - XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(ReadBanked); - MOV(32, rd, R(ABI_PARAM3)); + MOV(32, rd, R(RSCRATCH3)); } else MOV(32, rd, R(RCPSR)); @@ -75,28 +75,26 @@ void Compiler::A_Comp_MSR() { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); - XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(ReadBanked); - MOV(32, R(RSCRATCH2), Imm32(0xFFFFFF00)); - MOV(32, R(RSCRATCH3), Imm32(0xFFFFFFFF)); + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + AND(32, R(RSCRATCH4), Imm32(0xFFFFFF00)); MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); CMP(32, R(RSCRATCH), Imm8(0x10)); - CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_NE); - AND(32, R(RSCRATCH2), Imm32(mask)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH4), CC_E); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - NOT(32, R(RSCRATCH)); - AND(32, R(ABI_PARAM3), R(RSCRATCH)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + NOT(32, R(RSCRATCH4)); + AND(32, R(RSCRATCH3), R(RSCRATCH4)); AND(32, R(RSCRATCH2), val); - OR(32, R(ABI_PARAM3), R(RSCRATCH2)); + OR(32, R(RSCRATCH3), R(RSCRATCH2)); - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + 
MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(WriteBanked); } else @@ -219,13 +217,13 @@ Compiler::Compiler() { // RSCRATCH mode - // ABI_PARAM2 reg number - // ABI_PARAM3 value in current mode - // ret - ABI_PARAM3 + // RSCRATCH2 reg number + // RSCRATCH3 value in current mode + // ret - RSCRATCH3 ReadBanked = (void*)GetWritableCodePtr(); CMP(32, R(RSCRATCH), Imm8(0x11)); FixupBranch fiq = J_CC(CC_E); - SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); FixupBranch notEverything = J_CC(CC_L); CMP(32, R(RSCRATCH), Imm8(0x12)); FixupBranch irq = J_CC(CC_E); @@ -239,30 +237,30 @@ Compiler::Compiler() RET(); SetJumpTarget(fiq); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ))); RET(); SetJumpTarget(irq); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ))); RET(); SetJumpTarget(svc); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC))); RET(); SetJumpTarget(abt); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT))); RET(); SetJumpTarget(und); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND))); RET(); } { // RSCRATCH mode - // ABI_PARAM2 reg n - // ABI_PARAM3 value + // RSCRATCH2 reg n + // RSCRATCH3 value // carry flag set if the register isn't banked WriteBanked = (void*)GetWritableCodePtr(); CMP(32, R(RSCRATCH), Imm8(0x11)); FixupBranch fiq = J_CC(CC_E); - SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); FixupBranch notEverything = J_CC(CC_L); CMP(32, R(RSCRATCH), Imm8(0x12)); FixupBranch irq = J_CC(CC_E); @@ -277,23 +275,23 @@ Compiler::Compiler() RET(); SetJumpTarget(fiq); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(irq); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(svc); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(abt); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(und); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)), R(RSCRATCH3)); CLC(); RET(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2230eb8..e0a4978 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -16,6 +16,7 @@ const Gen::X64Reg RCPSR = Gen::R15; const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +const Gen::X64Reg RSCRATCH4 = Gen::R8; struct ComplexOperand { diff --git 
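
For reference, the bank dispatch that ReadBanked/WriteBanked implement, sketched as plain C for the read side (field layout as in ARM.h: R_FIQ holds R8_fiq..R14_fiq plus SPSR_fiq, the other mode arrays hold R13, R14 and SPSR). The helper takes reg - 8, which is why the Imm32(15 - 8) passed by the MRS/MSR code lands on the SPSR slot:

static uint32_t ReadBankedC(ARM* cpu, uint32_t mode, int idx /* reg - 8 */)
{
    if (mode == 0x11)                   // FIQ banks R8-R14 (and SPSR) wholesale
        return cpu->R_FIQ[idx];
    idx -= 5;                           // R13 -> 0, R14 -> 1, "R15" -> SPSR slot
    if (idx < 0)                        // R8-R12 are only banked in FIQ
        return cpu->R[idx + 13];
    switch (mode)
    {
    case 0x12: return cpu->R_IRQ[idx];  // IRQ
    case 0x13: return cpu->R_SVC[idx];  // supervisor
    case 0x17: return cpu->R_ABT[idx];  // abort
    case 0x1B: return cpu->R_UND[idx];  // undefined
    default:   return cpu->R[idx + 13]; // user/system: nothing banked
    }
}
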
a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 7f6fa53..85a3737 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -540,14 +540,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); - POP(ABI_PARAM3); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + POP(RSCRATCH3); CALL(WriteBanked); FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); else - SaveReg(reg, ABI_PARAM3); + SaveReg(reg, RSCRATCH3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) @@ -600,12 +600,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc firstUserMode = false; } if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, ABI_PARAM3); + LoadReg(reg, RSCRATCH3); else - MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); - MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); CALL(ReadBanked); - PUSH(ABI_PARAM3); + PUSH(RSCRATCH3); } else if (RegCache.Mapping[reg] == INVALID_REG) { -- cgit v1.2.3 From b902cd1b8e0b5509f108e2b60ded3ec38b1c53fc Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 23:25:32 +0200 Subject: fix regression from last commit also a small mistake with msr --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8d20425..dd20e3c 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -105,7 +105,7 @@ void Compiler::A_Comp_MSR() if ((mask & 0xFF) == 0) { AND(32, R(RCPSR), Imm32(~mask)); - if (val.IsImm()) + if (!val.IsImm()) { MOV(32, R(RSCRATCH), val); AND(32, R(RSCRATCH), Imm32(mask)); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 85a3737..b595e32 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -502,14 +502,6 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int regsCount = regs.Count(); - if (decrement) - { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes @@ -519,6 +511,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { Comp_AddCycles_CDI(); + if (decrement) + { + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(ABI_PARAM1), MapReg(rn)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
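
The "small mistake with msr" fixed above is an inverted test: the flags-only fast path may fold (operand & mask) into the OR at compile time only when the operand really is an immediate, while a register operand must be masked at run time; the old condition handed register operands to the immediate fold, calling val.Imm32() on a non-immediate OpArg. Both emit paths compute the same thing, namely:

// Run-time semantics of both emit paths (ad hoc names):
static uint32_t ApplyMSRMask(uint32_t cpsr, uint32_t operand, uint32_t mask)
{
    return (cpsr & ~mask) | (operand & mask);
}
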
Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -618,6 +618,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + if (decrement) + { + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(ABI_PARAM1), MapReg(rn)); + MOV(64, R(ABI_PARAM2), R(RSP)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); -- cgit v1.2.3 From 052ff7367211728209d6eb5f8f0f6d02cfab321e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 00:45:05 +0200 Subject: rewrite JIT memory emulation --- src/ARM.cpp | 10 +- src/ARM.h | 24 +- src/ARMJIT.cpp | 905 +++++++++++++++++++++++++--------- src/ARMJIT.h | 65 ++- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 +- src/ARMJIT_Internal.h | 68 ++- src/ARMJIT_RegisterCache.h | 18 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 34 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 935 +++++++++++++++++++----------------- src/ARM_InstrInfo.cpp | 16 +- src/CP15.cpp | 44 +- src/NDS.cpp | 105 +++- src/NDS.h | 9 +- 14 files changed, 1465 insertions(+), 815 deletions(-) (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 6b8df30..92a3a9e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -622,7 +622,8 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<0>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); + if (!translatedAddr) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); @@ -632,7 +633,7 @@ void ARMv5::ExecuteJIT() // hack so Cycles <= 0 becomes Cycles < 0 Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); if (block) ARM_Dispatch(this, block); else @@ -765,7 +766,8 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<1>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); + if (!translatedAddr) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); @@ -774,7 +776,7 @@ void ARMv4::ExecuteJIT() Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); if (block) ARM_Dispatch(this, block); else diff --git a/src/ARM.h b/src/ARM.h index b71102a..b1e8053 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -320,7 +320,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = BusRead8(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -329,7 +329,7 @@ public: addr &= ~1; *val = BusRead16(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -338,7 +338,7 @@ public: addr &= ~3; *val = BusRead32(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -353,7 +353,7 @@ public: void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -362,7 +362,7 @@ public: addr &= ~1; BusWrite16(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr 
>> 15][0]; } @@ -371,7 +371,7 @@ public: addr &= ~3; BusWrite32(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -402,7 +402,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -429,7 +429,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -455,4 +455,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 46f71f1..9602aed 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "GPU.h" #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" @@ -34,9 +35,10 @@ namespace ARMJIT #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) -Compiler* compiler; +Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = { +const u32 ExeMemRegionSizes[] = +{ 0x8000, // Unmapped Region (dummy) 0x8000, // ITCM 4*1024*1024, // Main RAM @@ -48,7 +50,8 @@ const u32 ExeMemRegionSizes[] = { 0x40000 // ARM7 WVRAM }; -const u32 ExeMemRegionOffsets[] = { +const u32 ExeMemRegionOffsets[] = +{ 0, 0x8000, 0x10000, @@ -61,65 +64,391 @@ const u32 ExeMemRegionOffsets[] = { 0x518000, }; -#define DUP2(x) x, x - -const static ExeMemKind JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(exeMem_ITCM), - /* 1X*/ DUP2(exeMem_ITCM), // mirror - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ DUP2(exeMem_SWRAM), - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ exeMem_Unmapped, - exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_ARM9_BIOS) - }, - //arm7 - { - /* 0X*/ DUP2(exeMem_ARM7_BIOS), - /* 1X*/ DUP2(exeMem_Unmapped), - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ exeMem_SWRAM, - exeMem_ARM7_WRAM, - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_Unmapped) - } -}; - -#undef DUP2 - /* translates address to pseudo physical address - more compact, eliminates mirroring, everything comes in a row - we only need one translation table */ -u32 AddrTranslate9[0x2000]; -u32 AddrTranslate7[0x4000]; + +u32 TranslateAddr9(u32 addr) +{ + switch (ClassifyAddress9(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM9: + if (NDS::SWRAM_ARM9) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); + else + return 0; + case 
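
One detail of the ARM.h hunks above: DataRegion used to hold addr >> 20 and the main-RAM checks shifted it by 4 more; it now holds the raw address and the checks test the top byte. The two classifications agree everywhere, which a two-line harness can confirm:

#include <cassert>
#include <cstdint>

static bool mainRAM_old(uint32_t addr) { return ((addr >> 20) >> 4) == 0x02; }
static bool mainRAM_new(uint32_t addr) { return (addr >> 24) == 0x02; }

int main()
{
    for (uint64_t a = 0; a <= 0xFFFFFFFFull; a += 0x100000)
        assert(mainRAM_old((uint32_t)a) == mainRAM_new((uint32_t)a));
    return 0;
}
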
memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); + case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; + case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); + default: return 0; + } +} + +u32 TranslateAddr7(u32 addr) +{ + switch (ClassifyAddress7(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM7: + if (NDS::SWRAM_ARM7) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); + else + return 0; + case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; + case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); + case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); + default: return 0; + } +} AddressRange CodeRanges[ExeMemSpaceSize / 512]; -std::unordered_map JitBlocks; +TinyVector InvalidLiterals; + +std::unordered_map JitBlocks9; +std::unordered_map JitBlocks7; + +u8 MemoryStatus9[0x800000]; +u8 MemoryStatus7[0x800000]; + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + return memregion_SWRAM9; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7) + return memregion_SWRAM7; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void UpdateMemoryStatus9(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress9(addr); + u32 pseudoPhyisical = TranslateAddr9(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus9[i * 8 + j] = val; + } + } +} + +void UpdateMemoryStatus7(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress7(addr); + u32 pseudoPhyisical = TranslateAddr7(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus7[i * 8 + j] = val; + } + } +} + +void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) +{ + for (u32 i = 1; i < exeMem_Count; i++) + { + if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) + { + for (u32 num = 0; num < 2; num++) + { + u32 physSize = ExeMemRegionSizes[i]; + u32 mapSize = 0; + u32 mapStart = 0; + switch (i) + { + case exeMem_ITCM: + if (num == 0) + 
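
The MemoryStatus9/MemoryStatus7 tables built above keep one byte per 512-byte page of the 4 GB space (hence 0x800000 entries): the low bits carry the memregion_* class from ClassifyAddress*, and bit 7 is set while compiled code lives in the page. A store fast path can then decide both where an access goes and whether it must invalidate with a single table load; accessor sketch:

#include <cstdint>

static inline bool PageHasCode(const uint8_t* status, uint32_t addr)
{
    return status[addr / 512] & 0x80;   // maintained by the update functions above
}

static inline int PageRegion(const uint8_t* status, uint32_t addr)
{
    return status[addr / 512] & 0x7F;   // memregion_* value
}
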
mapStart = 0; mapSize = NDS::ARM9->ITCMSize; + break; + case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; + case exeMem_SWRAM: + if (num == 0) + { + if (NDS::SWRAM_ARM9) + mapStart = 0x3000000, mapSize = 0x1000000; + else + mapStart = mapSize = 0; + } + else + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3000000, mapSize = 0x800000; + else + mapStart = mapSize = 0; + } + break; + case exeMem_LCDC: + if (num == 0) + mapStart = 0x6800000, mapSize = 0xA4000; + break; + case exeMem_ARM9_BIOS: + if (num == 0) + mapStart = 0xFFFF0000, mapSize = 0x10000; + break; + case exeMem_ARM7_BIOS: + if (num == 1) + mapStart = 0; mapSize = 0x4000; + break; + case exeMem_ARM7_WRAM: + if (num == 1) + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3800000, mapSize = 0x800000; + else + mapStart = 0x3000000, mapSize = 0x1000000; + } + break; + case exeMem_ARM7_WVRAM: + if (num == 1) + mapStart = 0x6000000, mapSize = 0x1000000; + break; + } + + for (u32 j = 0; j < mapSize / physSize; j++) + { + u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); + if (num == 0 + && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + if (invalidate) + { + if (num == 0) + MemoryStatus9[virtAddr / 512] |= 0x80; + else + MemoryStatus7[virtAddr / 512] |= 0x80; + } + else + { + if (num == 0) + MemoryStatus9[virtAddr / 512] &= ~0x80; + else + MemoryStatus7[virtAddr / 512] &= ~0x80; + } + } + + } + return; + } + } + + assert(false); +} + +template +T SlowRead9(ARMv5* cpu, u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same::value) + val = NDS::ARM9Read32(addr); + else if (std::is_same::value) + val = NDS::ARM9Read16(addr); + else + val = NDS::ARM9Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite9(ARMv5* cpu, u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + InvalidateITCMIfNecessary(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same::value) + { + NDS::ARM9Write32(addr, val); + } + else if (std::is_same::value) + { + NDS::ARM9Write16(addr, val); + } + else + { + NDS::ARM9Write8(addr, val); + } +} + +template void SlowWrite9(ARMv5*, u32, u32); +template void SlowWrite9(ARMv5*, u32, u16); +template void SlowWrite9(ARMv5*, u32, u8); + +template u32 SlowRead9(ARMv5*, u32); +template u16 SlowRead9(ARMv5*, u32); +template u8 SlowRead9(ARMv5*, u32); + +template +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same::value) + val = NDS::ARM7Read32(addr); + else if (std::is_same::value) + val = NDS::ARM7Read16(addr); + else + val = NDS::ARM7Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same::value) + NDS::ARM7Write32(addr, val); + else if (std::is_same::value) + NDS::ARM7Write16(addr, val); + else + NDS::ARM7Write8(addr, val); +} + +template +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < 
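
SlowRead9 above reproduces the ARM9 behaviour for unaligned 32-bit loads: the bus fetches the aligned word and the CPU rotates it so the addressed byte lands in bits 0-7, which is what ROR(val, offset << 3) does. A standalone illustration (little-endian host assumed):

#include <cstdint>
#include <cstring>

static uint32_t ror32(uint32_t x, unsigned r)
{
    return r ? (x >> r) | (x << (32 - r)) : x;
}

static uint32_t LoadRotated(const uint8_t* mem, uint32_t addr)
{
    uint32_t aligned;
    std::memcpy(&aligned, mem + (addr & ~3u), 4);  // word-aligned fetch
    return ror32(aligned, (addr & 3) * 8);         // addressed byte -> low byte
}
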
num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite9(cpu, addr, data[i]); + else + data[i] = SlowRead9(cpu, addr); + addr += !PreInc * 4; + } +} + +template +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite7(addr, data[i]); + else + data[i] = SlowRead7(addr); + addr += !PreInc * 4; + } +} + +template void SlowWrite7(u32, u32); +template void SlowWrite7(u32, u16); +template void SlowWrite7(u32, u8); + +template u32 SlowRead7(u32); +template u16 SlowRead7(u32); +template u8 SlowRead7(u32); + +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); template struct UnreliableHashTable @@ -211,31 +540,25 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp; +UnreliableHashTable FastBlockLookUp9; +UnreliableHashTable FastBlockLookUp7; void Init() { - for (int i = 0; i < 0x2000; i++) - { - ExeMemKind kind = JIT_MEM[0][i >> 8]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); - } - for (int i = 0; i < 0x4000; i++) - { - ExeMemKind kind = JIT_MEM[1][i >> 9]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); - } - - compiler = new Compiler(); + JITCompiler = new Compiler(); } void DeInit() { - delete compiler; + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + UpdateMemoryStatus9(0, 0xFFFFFFFF); + UpdateMemoryStatus7(0, 0xFFFFFFFF); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -256,25 +579,31 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) { - switch (instr.Info.Kind) + if (!thumb) { - case ARMInstrInfo::ak_STR_IMM: - case ARMInstrInfo::ak_STRB_IMM: - addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STRD_IMM: - case ARMInstrInfo::ak_STRH_IMM: - addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} - addr = instr.Addr + 8; + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 
1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); return true; - default: - JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); - return false; } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; } bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, @@ -453,6 +782,8 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F + +extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -463,31 +794,33 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - if (!(cpu->Num == 0 - ? IsMapped<0>(blockAddr) - : IsMapped<1>(blockAddr))) + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr9(blockAddr) + : TranslateAddr7(blockAddr); + if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) { printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); } - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr<0>(blockAddr) - : TranslateAddr<1>(blockAddr); - FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; - u32 addresseRanges[32] = {}; + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; u32 numAddressRanges = 0; + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, - CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -507,23 +840,29 @@ void CompileBlock(ARM* cpu) nextInstrAddr[1] = r15; JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); - u32 translatedAddr = (cpu->Num == 0 - ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; - if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = cpu->Num == 0 + ? 
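
The bookkeeping above is the heart of the rewrite: executable memory is tracked in 512-byte ranges, each subdivided into 32 chunks of 16 bytes, and a block records a 32-bit occupancy mask per range it touches. The index arithmetic, factored out:

#include <cstdint>

static inline uint32_t RangeIndex(uint32_t pseudoPhys)
{
    return pseudoPhys / 512;                   // one AddressRange per 512 bytes
}

static inline uint32_t ChunkBit(uint32_t pseudoPhys)
{
    return 1u << ((pseudoPhys & 0x1FF) / 16);  // 32 chunks of 16 bytes each
}

A write then only needs to tear down blocks whose mask for that range covers ChunkBit(writeAddr).
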
TranslateAddr9(instrs[i].Addr) + : TranslateAddr7(instrs[i].Addr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { bool returning = false; for (int j = 0; j < numAddressRanges; j++) { - if (addresseRanges[j] == translatedAddr) + if (addressRanges[j] == translatedAddrRounded) { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); returning = true; break; } } if (!returning) - addresseRanges[numAddressRanges++] = translatedAddr; + addressRanges[numAddressRanges++] = translatedAddrRounded; } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); if (cpu->Num == 0) { @@ -572,7 +911,8 @@ void CompileBlock(ARM* cpu) u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM - || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else @@ -583,21 +923,26 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; - if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem - && instrs[i].Info.SrcRegs == (1 << 15) - && instrs[i].Info.DstRegs == 0) + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) { - assert (!thumb); - - u32 addr; - if (DecodeLiteral(instrs[i], addr)) - { - JIT_DEBUGPRINT("pc relative write detected\n"); - u32 translatedAddr = cpu->Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - ARMJIT::InvalidateByAddr(translatedAddr, false); - CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); - } + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(literalAddr) + : TranslateAddr7(literalAddr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; } if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 @@ -650,8 +995,8 @@ void CompileBlock(ARM* cpu) else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr<0>(target) - : TranslateAddr<1>(target); + ? 
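
Block restoring above no longer compares instruction words one by one: a candidate is accepted when the XXH3 hashes of its fetched instruction words and of its literal values both match, and its range/mask lists are identical. The check, condensed (JitBlock accessors as declared in ARMJIT_Internal.h):

static bool MayRestore(JitBlock* prev, u32 instrHash, u32 literalHash,
                       const u32* ranges, const u32* masks, u32 numRanges)
{
    if (prev->InstrHash != instrHash || prev->LiteralHash != literalHash)
        return false;
    if (prev->NumAddresses != numRanges)
        return false;
    for (u32 j = 0; j < numRanges; j++)
        if (prev->AddressRanges()[j] != ranges[j] ||
            prev->AddressMasks()[j] != masks[j])
            return false;
    return true;
}
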
TranslateAddr9(target) + : TranslateAddr7(target); if (link) { @@ -688,36 +1033,29 @@ void CompileBlock(ARM* cpu) i++; - bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; if (prevBlock) { RestoreCandidates.Remove(pseudoPhysicalAddr); - if (prevBlock->NumInstrs == i) - { - for (int j = 0; j < i; j++) - { - if (prevBlock->Instrs()[j] != instrs[j].Instr) - { - mayRestore = false; - break; - } - } - } - else - mayRestore = false; - if (prevBlock->NumAddresses == numAddressRanges) + mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { for (int j = 0; j < numAddressRanges; j++) { - if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) { mayRestore = false; break; @@ -739,18 +1077,21 @@ void CompileBlock(ARM* cpu) if (prevBlock) delete prevBlock; - block = new JitBlock(i, numAddressRanges); - for (int j = 0; j < i; j++) - block->Instrs()[j] = instrs[j].Instr; + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; for (int j = 0; j < numAddressRanges; j++) - block->AddressRanges()[j] = addresseRanges[j]; + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; - block->StartAddr = blockAddr; block->PseudoPhysicalAddr = pseudoPhysicalAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -760,23 +1101,73 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { - assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; + CodeRanges[addressRanges[j] / 512].Blocks.Add(block); + + UpdateRegionByPseudoPhyiscal(addressRanges[j], true); } - JitBlocks[pseudoPhysicalAddr] = block; - FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); + if (cpu->Num == 0) + { + JitBlocks9[pseudoPhysicalAddr] = block; + FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } + else + { + JitBlocks7[pseudoPhysicalAddr] = block; + FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + 
} } -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) +void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - int startLength = range->Blocks.Length; - for (int i = 0; i < range->Blocks.Length; i++) + u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) { - assert(range->Blocks.Length == startLength); JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + bool literalInvalidation = false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == pseudoPhysical) + { + if (InvalidLiterals.Find(pseudoPhysical) != -1) + { + InvalidLiterals.Add(pseudoPhysical); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -786,76 +1177,59 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); + + if (otherRange->Blocks.Length == 0) + { + otherRange->Code = 0; + UpdateRegionByPseudoPhyiscal(addr, false); + } } } for (int j = 0; j < block->NumLinks(); j++) - compiler->UnlinkBlock(block->Links()[j]); + JITCompiler->UnlinkBlock(block->Links()[j]); + block->ResetLinks(); - JitBlocks.erase(block->PseudoPhysicalAddr); - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + if (block->Num == 0) + { + JitBlocks9.erase(block->PseudoPhysicalAddr); + FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); + } + else + { + JitBlocks7.erase(block->PseudoPhysicalAddr); + FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); + } - if (mayRestore) + if (!literalInvalidation) { JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); if (prevBlock) delete prevBlock; } + else + { + delete block; + } } - if ((range->TimesInvalidated + 1) > range->TimesInvalidated) - range->TimesInvalidated++; - - range->Blocks.Clear(); -} -void InvalidateByAddr7(u32 addr) -{ - u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) - InvalidateByAddr(pseudoPhysical); + if (range->Blocks.Length == 0) + UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); } -void InvalidateITCM(u32 addr) +void InvalidateRegionIfNecessary(u32 pseudoPhyisical) { - u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) - InvalidateByAddr(pseudoPhysical); -} - -void InvalidateAll() -{ - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); - for (auto it : JitBlocks) - { - JitBlock* block = it.second; - - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); - - for (int i = 0; i < block->NumAddresses; i++) - { - u32 addr = block->AddressRanges()[i]; - AddressRange* range = &CodeRanges[addr / 512]; - range->Blocks.Clear(); - if (range->TimesInvalidated + 1 > range->TimesInvalidated) - range->TimesInvalidated++; - } - for (int i = 0; i < block->NumLinks(); i++) - 
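
One observation on InvalidateByAddr above: the inner u32 mask declaration shadows the chunk mask computed from pseudoPhysical at the top of the function, so invalidated ends up testing the block's own mask against itself and fires for every block in the 512-byte range. That is conservative and therefore safe, just more eager than needed; the check it appears to aim for, spelled out:

static bool BlockCoversWrite(JitBlock* block, u32 writeAddr)
{
    u32 range = writeAddr & ~0x1FF;
    u32 chunk = 1u << ((writeAddr & 0x1FF) / 16);
    for (u32 j = 0; j < block->NumAddresses; j++)
        if (block->AddressRanges()[j] == range)
            return block->AddressMasks()[j] & chunk;  // occupies the written chunk?
    return false;
}
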
compiler->UnlinkBlock(block->Links()[i]); - block->ResetLinks(); - - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); - if (prevBlock) - delete prevBlock; - } - - JitBlocks.clear(); + if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) + InvalidateByAddr(pseudoPhyisical); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - FastBlockLookUp.Reset(); + InvalidLiterals.Clear(); + FastBlockLookUp9.Reset(); + FastBlockLookUp7.Reset(); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -870,61 +1244,119 @@ void ResetBlockCache() RestoreCandidates.Table[i].ValB = NULL; } } - for (auto it : JitBlocks) + for (auto it : JitBlocks9) { JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].TimesInvalidated = 0; - CodeRanges[addr / 512].InvalidLiterals = 0; + CodeRanges[addr / 512].Code = 0; } delete block; } - JitBlocks.clear(); + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); - compiler->Reset(); + JITCompiler->Reset(); } +template JitBlockEntry LookUpBlockEntry(u32 addr) { - u32 entryOffset = FastBlockLookUp.LookUp(addr); + auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; + u32 entryOffset = fastMap.LookUp(addr); if (entryOffset != UINT32_MAX) - return compiler->AddEntryOffset(entryOffset); + return JITCompiler->AddEntryOffset(entryOffset); - auto block = JitBlocks.find(addr); - if (block != JitBlocks.end()) + auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; + auto block = slowMap.find(addr); + if (block != slowMap.end()) { - FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); return block->second->EntryPoint; } return NULL; } +template JitBlockEntry LookUpBlockEntry<0>(u32); +template JitBlockEntry LookUpBlockEntry<1>(u32); + template void LinkBlock(ARM* cpu, u32 codeOffset) { - u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); - auto block = JitBlocks.find(targetPseudoPhys); - if (block == JitBlocks.end()) + auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; + u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); + u32 targetPseudoPhys = Num == 0 ? 
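
LookUpBlockEntry above is a two-level scheme: a fixed-size, lossy open hash (FastBlockLookUp9/7, storing compiler entry offsets so values fit in 32 bits) is probed first, and only on a miss is the authoritative std::unordered_map consulted and the fast table refilled. The shape of the lookup for the ARM9 side, condensed from the code above:

static JitBlockEntry LookUp9(u32 addr)
{
    u32 offset = FastBlockLookUp9.LookUp(addr);   // lossy cache, may have been evicted
    if (offset != UINT32_MAX)
        return JITCompiler->AddEntryOffset(offset);

    auto block = JitBlocks9.find(addr);           // ground truth
    if (block == JitBlocks9.end())
        return nullptr;
    FastBlockLookUp9.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint));
    return block->second->EntryPoint;
}
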
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); + auto block = blockMap.find(targetPseudoPhys); + if (block == blockMap.end()) { CompileBlock(cpu); - block = JitBlocks.find(targetPseudoPhys); + block = blockMap.find(targetPseudoPhys); } JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); block->second->AddLink(codeOffset); - compiler->LinkBlock(codeOffset, block->second->EntryPoint); + JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + +template void LinkBlock<0>(ARM*, u32); +template void LinkBlock<1>(ARM*, u32); + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } } void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) { - if ((addr & 0xFF000000) == 0x04000000) + switch (addr & 0xFF000000) { + case 0x04000000: if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) return (void*)NDSCart::ReadROMData; @@ -949,13 +1381,25 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) switch (size | store) { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; case 16: return (void*)NDS::ARM9IORead16; case 17: return (void*)NDS::ARM9IOWrite16; case 32: return (void*)NDS::ARM9IORead32; case 33: return (void*)NDS::ARM9IOWrite32; } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead; + case 9: return NULL; + case 16: return (void*)VRAMRead; + case 17: return (void*)VRAMWrite; + case 32: return (void*)VRAMRead; + case 33: return (void*)VRAMWrite; + } + break; } } else @@ -987,20 +1431,31 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } break; case 0x04800000: - if (addr < 0x04810000 && size == 16) + if (addr < 0x04810000 && size >= 16) { - if (store) - return (void*)Wifi::Write; - else - return (void*)Wifi::Read; + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } } break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } } } return NULL; } } - -template void ARMJIT::LinkBlock<0>(ARM*, u32); -template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index cab385f..44a6140 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,45 +28,60 @@ extern const u32 
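
Two details of GetFuncForAddr above: handlers are selected by the key (size | store), which yields the six unique values 8, 9, 16, 17, 32 and 33; and 32-bit accesses to the 16-bit-only Wifi region are widened by splitting them into two halfword accesses, low half first, exactly as WifiRead32/WifiWrite32 do. In isolation:

#include <cstdint>

static uint32_t Read32From16(uint16_t (*read16)(uint32_t), uint32_t addr)
{
    return read16(addr) | ((uint32_t)read16(addr + 2) << 16);
}

static void Write32To16(void (*write16)(uint32_t, uint16_t), uint32_t addr, uint32_t val)
{
    write16(addr, val & 0xFFFF);       // low halfword first
    write16(addr + 2, val >> 16);
}
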
ExeMemRegionSizes[]; typedef u32 (*JitBlockEntry)(); -extern u32 AddrTranslate9[0x2000]; -extern u32 AddrTranslate7[0x4000]; - const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... -template -inline bool IsMapped(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; -} - -template -inline u32 TranslateAddr(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); -} +u32 TranslateAddr9(u32 addr); +u32 TranslateAddr7(u32 addr); +template JitBlockEntry LookUpBlockEntry(u32 addr); - void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); -void InvalidateAll(); +void Reset(); + +void InvalidateByAddr(u32 pseudoPhysical); + +void InvalidateRegionIfNecessary(u32 addr); -void InvalidateITCM(u32 addr); -void InvalidateByAddr7(u32 addr); +inline void InvalidateMainRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); +} +inline void InvalidateITCMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); +} +inline void InvalidateLCDCIfNecessary(u32 addr) +{ + if (addr < 0x68A3FFF) + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); +} +inline void InvalidateSWRAM7IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); +} +inline void InvalidateSWRAM9IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); +} +inline void InvalidateARM7WRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); +} +inline void InvalidateARM7WVRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); +} void CompileBlock(ARM* cpu); void ResetBlockCache(); +void UpdateMemoryStatus9(u32 start, u32 end); +void UpdateMemoryStatus7(u32 start, u32 end); + } extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 00fa436..a67f357 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) + if ((CurInstr.DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 66d1808..4e45760 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -152,30 +152,34 @@ struct __attribute__((packed)) TinyVector class JitBlock { public: - JitBlock(u32 numInstrs, u32 numAddresses) + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) { - NumInstrs = numInstrs; + Num = num; NumAddresses = numAddresses; - Data.SetLength(numInstrs + numAddresses); + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); } - u32 StartAddr; u32 PseudoPhysicalAddr; - - u32 NumInstrs; - u32 NumAddresses; + + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; JitBlockEntry EntryPoint; - u32* Instrs() - { return &Data[0]; } u32* AddressRanges() - { return &Data[NumInstrs]; } + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } u32* Links() - { return &Data[NumInstrs + NumAddresses]; } + { return &Data[NumAddresses * 2 + NumLiterals]; } u32 NumLinks() - { return Data.Length - NumInstrs - NumAddresses; } + { return Data.Length - NumAddresses * 2 - NumLiterals; } void AddLink(u32 link) { @@ -184,7 +188,7 @@ public: void ResetLinks() { - Data.SetLength(NumInstrs + NumAddresses); + Data.SetLength(NumAddresses * 2 + NumLiterals); } private: @@ -200,8 +204,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; - u16 InvalidLiterals; - u16 TimesInvalidated; + u32 Code; }; extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; @@ -210,14 +213,45 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemRegion9[0x80000]; -extern u8 MemRegion7[0x80000]; +extern u8 MemoryStatus9[0x800000]; +extern u8 MemoryStatus7[0x800000]; + +extern TinyVector InvalidLiterals; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); template void LinkBlock(ARM* cpu, u32 codeOffset); +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM9, + memregion_SWRAM7, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +template T SlowRead9(ARMv5* cpu, u32 addr); +template void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); + +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 5e18e84..0547c84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,20 +95,6 @@ public: LiteralsLoaded = 0; } - BitSet32 GetPushRegs() - { - BitSet16 used; - for (int i = 0; i < InstrsCount; i++) - used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); - - BitSet32 res; - u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); - for (int i = 0; i < registersMax; i++) - res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); - - return res; - } - void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -139,7 
+125,6 @@ public: UnloadRegister(reg); u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; - u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -182,13 +167,12 @@ public: if (left-- == 0) break; - writeRegs |= (1 << reg) & instr.Info.DstRegs; LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); } } } - DirtyRegs |= writeRegs & ~(1 << 15); + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index dd20e3c..eee2e0f 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -195,26 +195,6 @@ Compiler::Compiler() Reset(); - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - } - MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; - MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; - MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; - MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; - MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; - MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) - { - MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); - MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); - MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); - } - { // RSCRATCH mode // RSCRATCH2 reg number @@ -317,6 +297,12 @@ Compiler::Compiler() // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; } void Compiler::LoadCPSR() @@ -504,6 +490,9 @@ void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; } void Compiler::Comp_SpecialBranchBehaviour(bool taken) @@ -544,8 +533,16 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); ResetBlockCache(); + } ConstantCycles = 0; Thumb = thumb; @@ -762,12 +759,14 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { + IrregularCycles = true; + s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e0a4978..9df218b 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -140,7 +140,7 @@ public: }; void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - void Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -154,12 +154,6 @@ public: void Comp_SpecialBranchBehaviour(bool taken); - void* Gen_MemoryRoutine9(bool store, int size); - - void* Gen_MemoryRoutineSeq9(bool store, bool preinc); - void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); - - void* Gen_ChangeCPSRRoutine(); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -193,6 +187,26 @@ public: return (u8*)entry - ResetStart; } + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + u8* ResetStart; u32 CodeMemSize; @@ -201,12 +215,6 @@ public: void* BranchStub[2]; - void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2]; - - void* MemoryFuncsSeq9[2][2]; - void* MemoryFuncsSeq7[2][2][2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b595e32..c13b779 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -25,236 +25,17 @@ int squeezePointer(T* ptr) improvement. */ -/* - address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) - store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) -*/ -void* Compiler::Gen_MemoryRoutine9(bool store, int size) +bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM9Write32, true); break; - case 16: JMP((u8*)NDS::ARM9Write16, true); break; - case 8: JMP((u8*)NDS::ARM9Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - // everything's already in the appropriate register - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM9Read16, true); - } - else - JMP((u8*)NDS::ARM9Read8, true); - } - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - if (store) - MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); - else - { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); + u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - SetJumpTarget(insideITCM); - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX - AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); - if (store) - { - MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - - // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! 
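// A hedged C++ sketch of the guard the following instructions implement;
// the SHR 9 / SHL 4 pair scales the pseudo-physical address into the
// CodeRanges table (512-byte regions, 16-byte AddressRange entries per the
// static_assert), and the CMP asks whether any compiled block overlaps the
// written range. ExeMemRegionOffsets/exeMem_ITCM are the mapping helpers
// used elsewhere in this series:
//
//     u32 pseudoPhys = ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF);
//     if (CodeRanges[pseudoPhys / 512].Blocks.Length > 0)
//         InvalidateByAddr(pseudoPhys); // a JIT block covers this store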
- static_assert(sizeof(AddressRange) == 16); - LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - JMP((u8*)InvalidateByAddr, true); - SetJumpTarget(noCode); - } - else + int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + if (invalidLiteralIdx != -1) { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - RET(); - - static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); - - return res; -} - -#define MEMORY_SEQ_WHILE_COND \ - if (!store) \ - MOV(32, currentElement, R(EAX));\ - if (!preinc) \ - ADD(32, R(ABI_PARAM1), Imm8(4)); \ - \ - SUB(32, R(ABI_PARAM3), Imm8(1)); \ - J_CC(CC_NZ, repeat); - -/* - ABI_PARAM1 address - ABI_PARAM2 address where registers are stored - ABI_PARAM3 how many values to read/write - - Dolphin x64CodeEmitter is my favourite assembler - */ -void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM9Write32); - } - else - CALL((void*)NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideITCM); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - - ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); - CALL((u8*)InvalidateByAddr); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - SetJumpTarget(noCode); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, 
offsetof(ARMv5, ITCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM7Write32); - } - else - CALL((void*)NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -#undef MEMORY_SEQ_WHILE_COND - -void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -276,12 +57,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) RegCache.PutLiteral(rd, val); Comp_AddCycles_CDI(); + + return true; } -/*void fault(u32 a, u32 b, u32 c, u32 d) -{ - printf("actually not static! %x %x %x %x\n", a, b, c, d); -}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,17 +70,12 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, rd, addr); + + if (Comp_MemLoadLiteral(size, rd, addr)) return; - } } { @@ -314,173 +88,334 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz Comp_AddCycles_CDI(); } + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; - - void* memoryFunc = Num == 0 - ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] - : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (!addrIsStatic) { - u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - - /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); - MOV(32, R(ABI_PARAM1), Imm32(R15)); - MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); - CMP(32, R(RSCRATCH), Imm32(addr)); - FixupBranch eq = J_CC(CC_E); - CALL((void*)fault); - SetJumpTarget(eq);*/ - - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) { - ARMv5* cpu5 = (ARMv5*)CurCPU; + MOV(32, R(RSCRATCH3), rnMapped); - // stupid dtcm... 
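// Note: the generated routines above fold the two-sided DTCM bounds test
// below into a single unsigned comparison (SUB base, CMP size, J_CC(CC_B)),
// relying on u32 wraparound. A minimal C++ sketch of that trick, using the
// ARMv5 fields visible here:
//
//     bool inDTCM = (addr - cpu5->DTCMBase) < cpu5->DTCMSize; // wraps below base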
- if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - // disable this for now as DTCM is located in heap - // which might excced the RIP-addressable range - //region.Mem = cpu5->DTCM; - //region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } + finalAddr = rnMapped.GetSimpleReg(); } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); - if (region.Mem != NULL) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + OpArg rm = MapReg(op2.Reg.Reg); - if (flags & memop_Store) + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) { - MOV(size, M(ptr), MapReg(rd)); + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); } else { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - if (size == 32 && addr & ~0x3) + if (flags & memop_SubtractOffset) { - ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } + else + MOV_sum(32, finalAddr, rnMapped, offset); } - - return; } - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memoryFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); } - X64Reg finalAddr = ABI_PARAM1; - if (flags & memop_Post) - { - MOV(32, R(ABI_PARAM1), rnMapped); + int expectedTarget = Num == 0 + ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + if (CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); - finalAddr = rnMapped.GetSimpleReg(); + switch (expectedTarget) + { + case memregion_MainRAM: + case memregion_DTCM: + case memregion_WRAM7: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_IO9: + case memregion_IO7: + case memregion_VWRAM: + compileFastPath = true; + break; + case memregion_Wifi: + compileFastPath = size >= 16; + break; + case memregion_VRAM: + compileFastPath = !(flags & memop_Store) || size >= 16; + case memregion_BIOS9: + compileFastPath = !(flags & memop_Store); + break; + default: break; } - if (op2.IsImm) + if (addrIsStatic && !compileFastPath) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + compileFastPath = false; + compileSlowPath = true; } - else + + if (addrIsStatic && compileSlowPath) + MOV(32, R(RSCRATCH3), Imm32(staticAddress)); + + if (compileFastPath) { - OpArg rm = MapReg(op2.Reg.Reg); + FixupBranch slowPath; + if (compileSlowPath) + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SHR(32, R(RSCRATCH), Imm8(9)); + if (flags & memop_Store) + { + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); + } + else + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + + slowPath = J_CC(CC_NE, true); + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 + || expectedTarget == memregion_BIOS9) { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + u8* data; + u32 mask; + if (expectedTarget == memregion_MainRAM) + { + data = NDS::MainRAM; + mask = MAIN_RAM_SIZE - 1; + } + else if (expectedTarget == memregion_BIOS9) + { + data = NDS::ARM9BIOS; + mask = 0xFFF; + } + else + { + data = NDS::ARM7WRAM; + mask = 0xFFFF; + } + OpArg memLoc; + if (addrIsStatic) + { + memLoc = M(data + ((staticAddress & mask & addressMask))); + } + else + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm32(mask & addressMask)); + memLoc = MDisp(RSCRATCH, squeezePointer(data)); + } + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - else + else if (expectedTarget == memregion_DTCM) + { + if (addrIsStatic) + MOV(32, R(RSCRATCH), Imm32(staticAddress)); + else + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + } + else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - - if (flags & memop_SubtractOffset) + MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + if (addrIsStatic) { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); + MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); } else - MOV_sum(32, finalAddr, rnMapped, offset); + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm8(addressMask)); + } + AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - } + else + { + u32 maskedDataRegion; - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); + if (addrIsStatic) + { + maskedDataRegion = staticAddress; + MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + AND(32, R(ABI_PARAM1), Imm8(addressMask)); - if (flags & memop_Store) - MOV(32, R(ABI_PARAM2), rdMapped); + maskedDataRegion = CurInstr.DataRegion; + if (Num == 0) + maskedDataRegion &= ~0xFFFFFF; + else + maskedDataRegion &= ~0x7FFFFF; + } - if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) - MOV(32, rdMapped, R(ABI_PARAM1)); + void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); - if (inlinePreparation && size > 8) - AND(32, R(ABI_PARAM1), Imm8(addressMask)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); - CALL(memoryFunc); + ABI_CallFunction((void(*)())func); + } + else + { + if (!addrIsStatic) + MOV(32, rdMapped, R(RSCRATCH3)); - /*if (Num == 0 && check) - { - CMP(32, R(EAX), rdMapped); - FixupBranch notEqual = J_CC(CC_E); - ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); - MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8))); - MOV(32, R(ABI_PARAM2), R(EAX)); - MOV(32, R(ABI_PARAM3), rdMapped); - MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); - CALL((u8*)fault); - ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); - SetJumpTarget(notEqual); - }*/ - - if (!(flags & memop_Store)) - { - if (inlinePreparation && size == 32) + ABI_CallFunction((void(*)())func); + + if (!addrIsStatic) + MOV(32, R(RSCRATCH3), rdMapped); + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if ((size == 32 && !(flags & memop_Store))) { - if (constLocalROR32 == 4) + if (addrIsStatic) + { + if (staticAddress & 0x3) + ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); + } + else { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(ECX), rdMapped); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else if (constLocalROR32 != 0) - ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); } - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + if (compileSlowPath) + { + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + } + + if (compileSlowPath) + { + if (Num == 0) + { + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + } + } + } else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), 
rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + } + } + } + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (compileFastPath && compileSlowPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); } if (!(flags & memop_Store) && rd == 15) @@ -498,100 +433,160 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - IrregularCycles = true; - int regsCount = regs.Count(); s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; +#else u32 stackAlloc = ((regsCount + 1) & ~1) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; - if (!store) + int expectedTarget = Num == 0 + ? ClassifyAddress9(CurInstr.DataRegion) + : ClassifyAddress7(CurInstr.DataRegion); + if (usermode || CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false; + + switch (expectedTarget) { + case memregion_DTCM: + case memregion_MainRAM: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_WRAM7: + compileFastPath = true; + break; + default: + break; + } + + if (!store) Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); - if (decrement) + if (decrement) + { + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(RSCRATCH4), MapReg(rn)); + + if (compileFastPath) + { + assert(!usermode); + + MOV(32, R(RSCRATCH), R(RSCRATCH4)); + SHR(32, R(RSCRATCH), Imm8(9)); + + if (store) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); } else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); - MOV(64, R(ABI_PARAM2), R(RSP)); - - CALL(Num == 0 - ? MemoryFuncsSeq9[0][preinc] - : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + FixupBranch slowPath = J_CC(CC_NE, true); - bool firstUserMode = true; - for (int reg = 15; reg >= 0; reg--) + if (expectedTarget == memregion_DTCM) { - if (regs[reg]) + SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); + LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); + } + else if (expectedTarget == memregion_MainRAM) + { + AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); + } + else if (expectedTarget == memregion_WRAM7) + { + AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); + } + else // SWRAM + { + AND(32, R(RSCRATCH4), Imm8(~3)); + AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + } + u32 offset = 0; + for (int reg : regs) + { + if (preinc) + offset += 4; + OpArg mem = MDisp(RSCRATCH4, offset); + if (store) { - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (RegCache.LoadedRegs & (1 << reg)) { - if (firstUserMode) - { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - firstUserMode = false; - } - MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - POP(RSCRATCH3); - CALL(WriteBanked); - FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); - else - SaveReg(reg, RSCRATCH3); - SetJumpTarget(sucessfulWritten); + MOV(32, mem, MapReg(reg)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else { - assert(reg != 15); - - POP(RSCRATCH); - SaveReg(reg, RSCRATCH); + LoadReg(reg, RSCRATCH); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); } else { - if (reg != 15) - RegCache.DirtyRegs |= (1 << reg); - POP(MapReg(reg).GetSimpleReg()); + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); } } + if (!preinc) + offset += 4; } - if (regsCount & 1) - POP(RSCRATCH); + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + + if (!store) + { + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc));
+ if (allocOffset == 0)
+ MOV(64, R(ABI_PARAM2), R(RSP));
+ else
+ LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset));
+
+ if (Num == 0)
+ MOV(64, R(ABI_PARAM4), R(RCPU));
- if (regs[15])
+ switch (Num * 2 | preinc)
 {
- if (Num == 1)
- {
- if (Thumb)
- OR(32, MapReg(15), Imm8(1));
- else
- AND(32, MapReg(15), Imm8(0xFE));
- }
- Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode);
+ case 0: CALL((void*)&SlowBlockTransfer9<false, false>); break;
+ case 1: CALL((void*)&SlowBlockTransfer9<true, false>); break;
+ case 2: CALL((void*)&SlowBlockTransfer7<false, false>); break;
+ case 3: CALL((void*)&SlowBlockTransfer7<true, false>); break;
 }
- }
- else
- {
- Comp_AddCycles_CD();
+ if (allocOffset)
+ ADD(64, R(RSP), Imm8(allocOffset));
- if (regsCount & 1)
- PUSH(RSCRATCH);
+ bool firstUserMode = true;
+ for (int reg : regs)
+ {
+ if (usermode && !regs[15] && reg >= 8 && reg < 15)
+ {
+ if (firstUserMode)
+ {
@@ -599,43 +594,107 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
 AND(32, R(RSCRATCH), Imm8(0x1F));
 firstUserMode = false;
 }
- if (RegCache.Mapping[reg] == INVALID_REG)
- LoadReg(reg, RSCRATCH3);
- else
- MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg]));
 MOV(32, R(RSCRATCH2), Imm32(reg - 8));
- CALL(ReadBanked);
- PUSH(RSCRATCH3);
+ POP(RSCRATCH3);
+ CALL(WriteBanked);
+ FixupBranch successfulWritten = J_CC(CC_NC);
+ if (RegCache.LoadedRegs & (1 << reg))
+ MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3));
+ else
+ SaveReg(reg, RSCRATCH3);
+ SetJumpTarget(successfulWritten);
 }
- else if (RegCache.Mapping[reg] == INVALID_REG)
+ else if (!(RegCache.LoadedRegs & (1 << reg)))
 {
- LoadReg(reg, RSCRATCH);
- PUSH(RSCRATCH);
+ assert(reg != 15);
+
+ POP(RSCRATCH);
+ SaveReg(reg, RSCRATCH);
 }
 else
 {
- PUSH(MapReg(reg).GetSimpleReg());
+ POP(MapReg(reg).GetSimpleReg());
 }
 }
-
- if (decrement)
+ }
+ else
+ {
+ bool firstUserMode = true;
+ for (int reg = 15; reg >= 0; reg--)
 {
- MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4));
- preinc ^= true;
+ if (regs[reg])
+ {
+ if (usermode && reg >= 8 && reg < 15)
+ {
+ if (firstUserMode)
+ {
+ MOV(32, R(RSCRATCH), R(RCPSR));
+ AND(32, R(RSCRATCH), Imm8(0x1F));
+ firstUserMode = false;
+ }
+ if (RegCache.Mapping[reg] == INVALID_REG)
+ LoadReg(reg, RSCRATCH3);
+ else
+ MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg]));
+ MOV(32, R(RSCRATCH2), Imm32(reg - 8));
+ CALL(ReadBanked);
+ PUSH(RSCRATCH3);
+ }
+ else if (!(RegCache.LoadedRegs & (1 << reg)))
+ {
+ LoadReg(reg, RSCRATCH);
+ PUSH(RSCRATCH);
+ }
+ else
+ {
+ PUSH(MapReg(reg).GetSimpleReg());
+ }
+ }
 }
- else
- MOV(32, R(ABI_PARAM1), MapReg(rn));
- MOV(64, R(ABI_PARAM2), R(RSP));
+ if (allocOffset)
+ SUB(64, R(RSP), Imm8(allocOffset));
+
+ MOV(32, R(ABI_PARAM1), R(RSCRATCH4));
+ if (allocOffset)
+ LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset));
+ else
+ MOV(64, R(ABI_PARAM2), R(RSP));
+ MOV(32, R(ABI_PARAM3), Imm32(regsCount));
+ if (Num == 0)
+ MOV(64, R(ABI_PARAM4), R(RCPU));
- CALL(Num == 0
- ? MemoryFuncsSeq9[1][preinc]
- : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]);
+ switch (Num * 2 | preinc)
+ {
+ case 0: CALL((void*)&SlowBlockTransfer9<false, true>); break;
+ case 1: CALL((void*)&SlowBlockTransfer9<true, true>); break;
+ case 2: CALL((void*)&SlowBlockTransfer7<false, true>); break;
+ case 3: CALL((void*)&SlowBlockTransfer7<true, true>); break;
+ }
 ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); } + if (compileFastPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + return offset; } @@ -786,9 +845,7 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); - else + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 28362d9..b50e821 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -373,16 +373,16 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); - res.NotStrictlyNeeded |= set; + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.DstRegs |= set; } if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + u32 set = (instr & 0xFF); if (res.Kind == tk_PUSH && instr & (1 << 8)) set |= (1 << 14); - res.NotStrictlyNeeded |= set; + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.SrcRegs |= set; } @@ -495,15 +495,15 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.DstRegs |= set; - res.NotStrictlyNeeded |= set; } if (res.Kind == ak_STM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.SrcRegs |= set; - res.NotStrictlyNeeded |= set; } if ((instr >> 28) < 0xE) diff --git a/src/CP15.cpp b/src/CP15.cpp index ff8531c..225847e 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -98,6 +98,10 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { +#ifdef JIT_ENABLED + u32 oldDTCMBase = DTCMBase; + u32 oldDTCMSize = DTCMSize; +#endif if (CP15Control & (1<<16)) { DTCMBase = DTCMSetting & 0xFFFFF000; @@ -110,10 +114,20 @@ void ARMv5::UpdateDTCMSetting() DTCMSize = 0; //printf("DTCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + { + ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); + ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + } +#endif } void ARMv5::UpdateITCMSetting() { +#ifdef JIT_ENABLED + u32 oldITCMSize = ITCMSize; +#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -124,6 +138,10 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldITCMSize != ITCMSize) + ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); +#endif } @@ -562,15 +580,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: -#ifdef JIT_ENABLED - ARMJIT::InvalidateAll(); -#endif ICacheInvalidateAll(); return; case 0x751: -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); -#endif ICacheInvalidateByAddr(val); return; case 0x752: @@ -733,7 +745,7 @@ u32 ARMv5::CodeRead32(u32 
addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { @@ -754,7 +766,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -777,7 +789,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -821,14 +833,14 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -845,7 +857,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -854,7 +866,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -871,7 +883,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -880,7 +892,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -904,7 +916,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 7b6a450..56e7566 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -574,10 +574,6 @@ void Reset() KeyCnt = 0; RCnt = 0; -#ifdef JIT_ENABLED - ARMJIT::ResetBlockCache(); -#endif - NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); @@ -593,6 +589,10 @@ void Reset() } AREngine::Reset(); + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif } void Stop() @@ -1127,6 +1127,9 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + WRAMCnt = val; switch (WRAMCnt & 0x3) @@ -1159,6 +1162,11 @@ void MapSharedWRAM(u8 val) SWRAM_ARM7Mask = 0x7FFF; break; } + +#ifdef JIT_ENABLED + ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); + ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); +#endif } @@ -2020,11 +2028,17 @@ void ARM9Write8(u32 addr, u8 val) { case 0x02000000: *(u8*)&MainRAM[addr & MainRAMMask] = val; +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2070,11 +2084,17 @@ void ARM9Write16(u32 addr, u16 val) { case 0x02000000: *(u16*)&MainRAM[addr & MainRAMMask] = val; +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2095,7 +2115,12 @@ void ARM9Write16(u32 addr, u16 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: 
GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -2136,11 +2161,17 @@ void ARM9Write32(u32 addr, u32 val) { case 0x02000000: *(u32*)&MainRAM[addr & MainRAMMask] = val; +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif return ; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2161,7 +2192,12 @@ void ARM9Write32(u32 addr, u32 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -2426,30 +2462,38 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: *(u8*)&MainRAM[addr & MainRAMMask] = val; +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2459,6 +2503,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2489,30 +2536,38 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: *(u16*)&MainRAM[addr & MainRAMMask] = val; +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2530,6 +2585,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2562,30 +2620,38 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: *(u32*)&MainRAM[addr & MainRAMMask] = val; +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + 
ARMJIT::InvalidateARM7WRAMIfNecessary(addr);
+#endif
 *(u32*)&ARM7WRAM[addr & 0xFFFF] = val;
 return;
 }
 case 0x03800000:
+#ifdef JIT_ENABLED
+ ARMJIT::InvalidateARM7WRAMIfNecessary(addr);
+#endif
 *(u32*)&ARM7WRAM[addr & 0xFFFF] = val;
 return;
@@ -2604,6 +2670,9 @@ void ARM7Write32(u32 addr, u32 val)
 case 0x06000000:
 case 0x06800000:
+#ifdef JIT_ENABLED
+ ARMJIT::InvalidateARM7WVRAMIfNecessary(addr);
+#endif
 GPU::WriteVRAM_ARM7(addr, val);
 return;
diff --git a/src/NDS.h b/src/NDS.h
index 9c5fe3d..6eda658 100644
--- a/src/NDS.h
+++ b/src/NDS.h
@@ -80,7 +80,7 @@ enum
 IRQ_IPCSendDone,
 IRQ_IPCRecv,
 IRQ_CartSendDone, // TODO: less misleading name
- IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pokémon Typing Adventure, BT controller)
+ IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pokémon Typing Adventure, BT controller)
 IRQ_GXFIFO,
 IRQ_LidOpen,
 IRQ_SPI,
@@ -163,6 +163,13 @@ extern u16 ARM7BIOSProt;
 extern u8 MainRAM[0x1000000];
 extern u32 MainRAMMask;
+extern u8 SharedWRAM[0x8000];
+extern u8* SWRAM_ARM9;
+extern u8* SWRAM_ARM7;
+extern u32 SWRAM_ARM9Mask;
+extern u32 SWRAM_ARM7Mask;
+
+extern u8 ARM7WRAM[0x10000];
 extern u32 KeyInput;
-- cgit v1.2.3

From 80b88dbd05a66ad50108778d5f36e17f5b1cd661 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sat, 9 May 2020 14:34:52 +0200
Subject: allow allocating caller saved registers currently system-v only

---
 src/ARMJIT_x64/ARMJIT_Branch.cpp | 19 ++----------
 src/ARMJIT_x64/ARMJIT_Compiler.cpp | 58 ++++++++++++++++++++++++++-----------
 src/ARMJIT_x64/ARMJIT_Compiler.h | 3 ++
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 19 ++++++++++++
 4 files changed, 65 insertions(+), 34 deletions(-)

(limited to 'src')

diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index cac590a..27c24c7 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -138,18 +138,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR)
 bool cpsrDirty = CPSRDirty;
 SaveCPSR();

- if (restoreCPSR)
- {
- if (Thumb || CurInstr.Cond() >= 0xE)
- RegCache.Flush();
- else
- {
- // the ugly way... 
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + PushRegs(restoreCPSR); MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), R(addr)); @@ -162,11 +151,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else CALL((void*)&ARMv4::JumpTo); - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } + PopRegs(restoreCPSR); LoadCPSR(); // in case this instruction is skipped diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index eee2e0f..ef04601 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,7 +26,8 @@ const X64Reg RegisterCache::NativeRegAllocOrder[] = #ifdef _WIN32 RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13, R14 // this is sad + RBX, R12, R13, R14, // callee saved, this is sad + R9, R10, R11, // caller saved #endif }; template <> @@ -34,10 +35,46 @@ const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 6 #else - 4 + 7 #endif ; +void Compiler::PushRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + + if (saveHiRegs) + { + BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + { + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.UnloadRegister(reg); + else + SaveReg(reg, RegCache.Mapping[reg]); + // prevent saving the register twice + loadedRegs[reg] = false; + } + } + + for (int reg : loadedRegs) + if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + SaveReg(reg, RegCache.Mapping[reg]); +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + for (int reg : loadedRegs) + { + if ((saveHiRegs && reg >= 8 && reg < 15) + || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + { + LoadReg(reg, RegCache.Mapping[reg]); + } + } +} + void Compiler::A_Comp_MRS() { Comp_AddCycles_C(); @@ -136,27 +173,14 @@ void Compiler::A_Comp_MSR() AND(32, R(RSCRATCH2), val); OR(32, R(RCPSR), R(RSCRATCH2)); - BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
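// (The "ugly way" above is what the PushRegs(true)/PopRegs(true) pair below
//  replaces: ARM::UpdateMode() rebanks R8-R14, so any of those guest
//  registers cached in host registers must be written back to ARM::R[]
//  before the call and reloaded afterwards. Hedged sketch of the per-register
//  test the new helpers apply, using names from this series' register cache:
//
//      if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED)
//          SaveReg(reg, RegCache.Mapping[reg]); // spill across the C call
//  )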
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } + PushRegs(true); MOV(32, R(ABI_PARAM3), R(RCPSR)); MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); MOV(64, R(ABI_PARAM1), R(RCPU)); CALL((void*)&ARM::UpdateMode); - if (!Thumb && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } + PopRegs(true); } } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9df218b..f2fc301 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -168,6 +168,9 @@ public: Gen::FixupBranch CheckCondition(u32 cond); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Gen::OpArg MapReg(int reg) { if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index c13b779..b27efdd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -283,6 +283,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } else { + PushRegs(false); + u32 maskedDataRegion; if (addrIsStatic) @@ -310,6 +312,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, R(ABI_PARAM2), rdMapped); ABI_CallFunction((void(*)())func); + + PopRegs(false); } else { @@ -318,6 +322,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ABI_CallFunction((void(*)())func); + PopRegs(false); + if (!addrIsStatic) MOV(32, R(RSCRATCH3), rdMapped); @@ -352,6 +358,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (compileSlowPath) { + PushRegs(false); + if (Num == 0) { MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); @@ -402,6 +410,9 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } } } + + PopRegs(false); + if (!(flags & memop_Store)) { if (flags & memop_SignExtend) @@ -561,6 +572,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (!store) { + PushRegs(false); + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); @@ -580,6 +593,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc case 3: CALL((void*)&SlowBlockTransfer7); break; } + PopRegs(false); + if (allocOffset) ADD(64, R(RSP), Imm8(allocOffset)); @@ -655,6 +670,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (allocOffset) SUB(64, R(RSP), Imm8(allocOffset)); + PushRegs(false); + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); if (allocOffset) LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); @@ -674,6 +691,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + + PopRegs(false); } if (compileFastPath) -- cgit v1.2.3 From efb796640b6b4c140dd8e2924e740702d66e7823 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 14:36:18 +0200 Subject: use instr hash as key for restore candidates makes Golden Sun burn a little slower through the JIT memory --- src/ARMJIT.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 9602aed..8d87c76 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1042,13 +1042,13 @@ void CompileBlock(ARM* cpu) u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); - JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates.LookUp(instrHash); bool mayRestore = true; if (prevBlock) { - RestoreCandidates.Remove(pseudoPhysicalAddr); + RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1125,6 +1125,7 @@ void CompileBlock(ARM* cpu) void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 512]; u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); @@ -1203,7 +1204,7 @@ void InvalidateByAddr(u32 pseudoPhysical) if (!literalInvalidation) { - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + JitBlock* prevBlock = RestoreCandidates.Insert(block->InstrHash, block); if (prevBlock) delete prevBlock; } -- cgit v1.2.3 From c17f7b100e36edb1c728dbf21c77f9484d1820c6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 15:39:39 +0200 Subject: allow allocating caller saved regs on windows --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ef04601..fd3fb70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -24,7 +24,8 @@ template <> const X64Reg RegisterCache::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13, R14 + RBX, RSI, RDI, R12, R13, R14, // callee saved + R10, R11, // caller saved #else RBX, R12, R13, R14, // callee saved, this is sad R9, R10, R11, // caller saved @@ -33,7 +34,7 @@ const X64Reg RegisterCache::NativeRegAllocOrder[] = template <> const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 - 6 + 8 #else 7 #endif -- cgit v1.2.3 From fea9f95bba7475b2cd3b624a3ccd6cdee00a33f1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 12 May 2020 16:09:20 +0200 Subject: fix inlined IO register access --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 1 - src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 27c24c7..bda9e52 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -134,7 +134,6 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; - BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); bool cpsrDirty = CPSRDirty; SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp 
b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b27efdd..cf0bd23 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -283,8 +283,6 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } else { - PushRegs(false); - u32 maskedDataRegion; if (addrIsStatic) @@ -309,6 +307,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (flags & memop_Store) { + PushRegs(false); + MOV(32, R(ABI_PARAM2), rdMapped); ABI_CallFunction((void(*)())func); @@ -320,6 +320,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (!addrIsStatic) MOV(32, rdMapped, R(RSCRATCH3)); + PushRegs(false); + ABI_CallFunction((void(*)())func); PopRegs(false); -- cgit v1.2.3 From e335a8ca7615c702cfa2dcdb71deb69468088fd8 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jun 2020 21:04:25 +0200 Subject: first steps in bringing over the JIT refactor/fastmem --- src/ARM.cpp | 43 +- src/ARM.h | 15 +- src/ARMJIT.cpp | 771 ++++++++++----------------------- src/ARMJIT.h | 64 +-- src/ARMJIT_A64/ARMJIT_ALU.cpp | 123 +++++- src/ARMJIT_A64/ARMJIT_Branch.cpp | 99 ++--- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 383 ++++++++++++----- src/ARMJIT_A64/ARMJIT_Compiler.h | 71 +++- src/ARMJIT_A64/ARMJIT_Linkage.s | 68 +++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 790 ++++++++++++++++------------------ src/ARMJIT_Compiler.h | 12 + src/ARMJIT_Internal.h | 70 +-- src/ARMJIT_Memory.cpp | 822 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_Memory.h | 53 +++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 92 +--- src/ARMJIT_x64/ARMJIT_Compiler.h | 11 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 45 +- src/ARM_InstrInfo.cpp | 73 ++-- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 6 +- src/CP15.cpp | 84 ++-- src/Config.cpp | 6 +- src/Config.h | 1 + src/NDS.cpp | 220 +++++----- src/NDS.h | 17 +- 25 files changed, 2342 insertions(+), 1598 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_Memory.cpp create mode 100644 src/ARMJIT_Memory.h (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index 92a3a9e..e529be8 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,6 +21,8 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "ARMJIT.h" +#include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" @@ -74,7 +76,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -82,6 +86,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -622,24 +633,26 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); - if (!translatedAddr) + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - // hack so Cycles <= 0 becomes Cycles < 0 - Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); + ARMJIT::JitBlockEntry block = 
ARMJIT::LookUpBlock(0, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; if (StopExecution) { @@ -766,23 +779,25 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); - if (!translatedAddr) + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index b1e8053..b7f16d6 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -32,11 +32,14 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); @@ -143,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -158,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -260,8 +269,8 @@ public: u32 DTCMBase, DTCMSize; s32 RegionCodeCycles; - u8 ITCM[0x8000]; - u8 DTCM[0x4000]; + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8d87c76..53b28c1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -10,13 +10,8 @@ #include "Config.h" #include "ARMJIT_Internal.h" -#if defined(__x86_64__) -#include "ARMJIT_x64/ARMJIT_Compiler.h" -#elif defined(__aarch64__) -#include "ARMJIT_A64/ARMJIT_Compiler.h" -#else -#error "The current target platform doesn't have a JIT backend" -#endif +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -29,6 +24,11 @@ #include "Wifi.h" #include "NDSCart.h" +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + namespace ARMJIT { @@ -37,281 +37,100 @@ namespace ARMJIT Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = -{ - 0x8000, // Unmapped Region (dummy) - 0x8000, // ITCM - 4*1024*1024, // Main RAM - 0x8000, // SWRAM - 0xA4000, // LCDC - 0x8000, // ARM9 BIOS - 0x4000, // ARM7 BIOS - 0x10000, // ARM7 WRAM - 0x40000 // ARM7 WVRAM -}; - -const u32 ExeMemRegionOffsets[] = -{ - 0, - 0x8000, - 0x10000, - 0x410000, - 0x418000, - 0x4BC000, - 0x4C4000, - 0x4C8000, - 0x4D8000, - 0x518000, -}; - -/* - translates address to pseudo 
physical address - - more compact, eliminates mirroring, everything comes in a row - - we only need one translation table -*/ - -u32 TranslateAddr9(u32 addr) -{ - switch (ClassifyAddress9(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM9: - if (NDS::SWRAM_ARM9) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); - else - return 0; - case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); - case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; - case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); - default: return 0; - } -} - -u32 TranslateAddr7(u32 addr) -{ - switch (ClassifyAddress7(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM7: - if (NDS::SWRAM_ARM7) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); - else - return 0; - case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; - case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); - case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); - default: return 0; - } -} - -AddressRange CodeRanges[ExeMemSpaceSize / 512]; - -TinyVector InvalidLiterals; +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; -u8 MemoryStatus9[0x800000]; -u8 MemoryStatus7[0x800000]; +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; -int ClassifyAddress9(u32 addr) +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { - if (addr < NDS::ARM9->ITCMSize) - return memregion_ITCM; - else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else - { - switch (addr & 0xFF000000) - { - case 0x02000000: - return memregion_MainRAM; - case 0x03000000: - return memregion_SWRAM9; - case 0x04000000: - return memregion_IO9; - case 0x06000000: - return memregion_VRAM; - } - } - return memregion_Other; -} + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, +}; -int ClassifyAddress7(u32 addr) +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = { - if (addr < 0x00004000) - return memregion_BIOS7; - else - { - switch (addr & 0xFF800000) - { - case 0x02000000: - case 0x02800000: - return memregion_MainRAM; - 
case 0x03000000: - if (NDS::SWRAM_ARM7) - return memregion_SWRAM7; - else - return memregion_WRAM7; - case 0x03800000: - return memregion_WRAM7; - case 0x04000000: - return memregion_IO7; - case 0x04800000: - return memregion_Wifi; - case 0x06000000: - case 0x06800000: - return memregion_VWRAM; - } - } - return memregion_Other; -} + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, +}; -void UpdateMemoryStatus9(u32 start, u32 end) +u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) - { - u32 addr = i << 12; - - int region = ClassifyAddress9(addr); - u32 pseudoPhyisical = TranslateAddr9(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus9[i * 8 + j] = val; - } - } -} + NULL, + FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM +}; -void UpdateMemoryStatus7(u32 start, u32 end) +u32 LocaliseCodeAddress(u32 num, u32 addr) { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize) + && CodeMemRegions[region]) { - u32 addr = i << 12; - - int region = ClassifyAddress7(addr); - u32 pseudoPhyisical = TranslateAddr7(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus7[i * 8 + j] = val; - } + addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + addr |= (u32)region << 28; + return addr; } + return 0; } -void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) -{ - for (u32 i = 1; i < exeMem_Count; i++) - { - if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) - { - for (u32 num = 0; num < 2; num++) - { - u32 physSize = ExeMemRegionSizes[i]; - u32 mapSize = 0; - u32 mapStart = 0; - switch (i) - { - case exeMem_ITCM: - if (num == 0) - mapStart = 0; mapSize = NDS::ARM9->ITCMSize; - break; - case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; - case exeMem_SWRAM: - if (num == 0) - { - if (NDS::SWRAM_ARM9) - mapStart = 0x3000000, mapSize = 0x1000000; - else - mapStart = mapSize = 0; - } - else - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3000000, mapSize = 0x800000; - else - mapStart = mapSize = 0; - } - break; - case exeMem_LCDC: - if (num == 0) - mapStart = 0x6800000, mapSize = 0xA4000; - break; - case exeMem_ARM9_BIOS: - if (num == 0) - mapStart = 0xFFFF0000, mapSize = 0x10000; - break; - case exeMem_ARM7_BIOS: - if (num == 1) - mapStart = 0; mapSize = 0x4000; - break; - case exeMem_ARM7_WRAM: - if (num == 1) - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3800000, mapSize = 0x800000; - else - mapStart = 0x3000000, mapSize = 0x1000000; - } - break; - case exeMem_ARM7_WVRAM: - if (num == 1) - mapStart = 0x6000000, mapSize = 0x1000000; - break; - } - - for (u32 j = 0; j 
< mapSize / physSize; j++) - { - u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); - if (num == 0 - && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - continue; - if (invalidate) - { - if (num == 0) - MemoryStatus9[virtAddr / 512] |= 0x80; - else - MemoryStatus7[virtAddr / 512] |= 0x80; - } - else - { - if (num == 0) - MemoryStatus9[virtAddr / 512] &= ~0x80; - else - MemoryStatus7[virtAddr / 512] &= ~0x80; - } - } - - } - return; - } - } - - assert(false); -} +TinyVector InvalidLiterals; template -T SlowRead9(ARMv5* cpu, u32 addr) +T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; addr &= ~(sizeof(T) - 1); @@ -335,13 +154,13 @@ T SlowRead9(ARMv5* cpu, u32 addr) } template -void SlowWrite9(ARMv5* cpu, u32 addr, T val) +void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); if (addr < cpu->ITCMSize) { - InvalidateITCMIfNecessary(addr); + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); *(T*)&cpu->ITCM[addr & 0x7FFF] = val; } else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) @@ -362,13 +181,13 @@ void SlowWrite9(ARMv5* cpu, u32 addr, T val) } } -template void SlowWrite9(ARMv5*, u32, u32); -template void SlowWrite9(ARMv5*, u32, u16); -template void SlowWrite9(ARMv5*, u32, u8); +template void SlowWrite9(u32, ARMv5*, u32); +template void SlowWrite9(u32, ARMv5*, u16); +template void SlowWrite9(u32, ARMv5*, u8); -template u32 SlowRead9(ARMv5*, u32); -template u16 SlowRead9(ARMv5*, u32); -template u8 SlowRead9(ARMv5*, u32); +template u32 SlowRead9(u32, ARMv5*); +template u16 SlowRead9(u32, ARMv5*); +template u8 SlowRead9(u32, ARMv5*); template T SlowRead7(u32 addr) @@ -407,14 +226,15 @@ template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) - SlowWrite9(cpu, addr, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(cpu, addr); - addr += !PreInc * 4; + data[i] = SlowRead9(addr, cpu); + addr += 4; } } @@ -422,14 +242,15 @@ template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) SlowWrite7(addr, data[i]); else data[i] = SlowRead7(addr); - addr += !PreInc * 4; + addr += 4; } } @@ -540,16 +361,18 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp9; -UnreliableHashTable FastBlockLookUp7; void Init() { JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); } void DeInit() { + ARMJIT_Memory::DeInit(); + delete JITCompiler; } @@ -557,8 +380,7 @@ void Reset() { ResetBlockCache(); - UpdateMemoryStatus9(0, 0xFFFFFFFF); - UpdateMemoryStatus7(0, 0xFFFFFFFF); + ARMJIT_Memory::Reset(); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -673,11 +495,12 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) // it basically checks if one iteration of a loop depends on another // the rules are quite simple + JIT_DEBUGPRINT("checking potential idle loop\n"); u16 regsWrittenTo = 0; u16 regsDisallowedToWrite = 0; for (int i = 0; i < instrsCount; i++) { - //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, 
regsDisallowedToWrite); if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) return false; if (i < instrsCount - 1 && instrs[i].Info.Branches()) @@ -782,8 +605,6 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F - -extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -794,14 +615,28 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr9(blockAddr) - : TranslateAddr7(blockAddr); - if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) - { - printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); - } - + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 localAddr = existingBlockIt->second->StartAddrLocal; + + u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; @@ -842,9 +677,8 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(instrs[i].Addr) - : TranslateAddr7(instrs[i].Addr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -928,9 +762,11 @@ void CompileBlock(ARM* cpu) && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral && DecodeLiteral(thumb, instrs[i], literalAddr)) { - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(literalAddr) - : TranslateAddr7(literalAddr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } u32 translatedAddrRounded = translatedAddr & ~0x1FF; u32 j = 0; @@ -994,9 +830,7 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetPseudoPhysical = cpu->Num == 0 - ? 
TranslateAddr9(target) - : TranslateAddr7(target); + u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); if (link) { @@ -1048,7 +882,7 @@ void CompileBlock(ARM* cpu) { RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1087,11 +921,12 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numLiterals; j++) block->Literals()[j] = literalLoadAddrs[j]; - block->PseudoPhysicalAddr = pseudoPhysicalAddr; + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); } else { @@ -1104,30 +939,34 @@ void CompileBlock(ARM* cpu) assert(addressRanges[j] == block->AddressRanges()[j]); assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; - CodeRanges[addressRanges[j] / 512].Blocks.Add(block); - UpdateRegionByPseudoPhyiscal(addressRanges[j], true); + AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); } if (cpu->Num == 0) - { - JitBlocks9[pseudoPhysicalAddr] = block; - FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks9[blockAddr] = block; else - { - JitBlocks7[pseudoPhysicalAddr] = block; - FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 localAddr) { - JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + AddressRange* region = CodeMemRegions[localAddr >> 28]; + AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; for (int i = 0; i < range->Blocks.Length;) @@ -1138,7 +977,7 @@ void InvalidateByAddr(u32 pseudoPhysical) u32 mask = 0; for (int j = 0; j < block->NumAddresses; j++) { - if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) { mask = block->AddressMasks()[j]; invalidated = block->AddressMasks()[j] & mask; @@ -1154,15 +993,21 @@ void InvalidateByAddr(u32 pseudoPhysical) } range->Blocks.Remove(i); + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + } + bool literalInvalidation = false; for (int j = 0; j < block->NumLiterals; j++) { u32 addr = block->Literals()[j]; - if (addr == pseudoPhysical) + if (addr == 
localAddr) { - if (InvalidLiterals.Find(pseudoPhysical) != -1) + if (InvalidLiterals.Find(localAddr) != -1) { - InvalidLiterals.Add(pseudoPhysical); + InvalidLiterals.Add(localAddr); JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); } literalInvalidation = true; @@ -1172,35 +1017,30 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 512) != (pseudoPhysical / 512)) + if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 28]; + AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; assert(otherRange != range); + bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); if (otherRange->Blocks.Length == 0) { + if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + otherRange->Code = 0; - UpdateRegionByPseudoPhyiscal(addr, false); } } } - for (int j = 0; j < block->NumLinks(); j++) - JITCompiler->UnlinkBlock(block->Links()[j]); - block->ResetLinks(); - + FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) - { - JitBlocks9.erase(block->PseudoPhysicalAddr); - FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); - } + JitBlocks9.erase(block->StartAddr); else - { - JitBlocks7.erase(block->PseudoPhysicalAddr); - FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); - } + JitBlocks7.erase(block->StartAddr); if (!literalInvalidation) { @@ -1213,24 +1053,66 @@ void InvalidateByAddr(u32 pseudoPhysical) delete block; } } +} - if (range->Blocks.Length == 0) - UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); +template +void CheckAndInvalidate(u32 addr) +{ + // let's hope this gets all properly inlined + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + { + u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr | (region << 28)); + } +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; } -void InvalidateRegionIfNecessary(u32 pseudoPhyisical) +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { - if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) - InvalidateByAddr(pseudoPhyisical); + int region = num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (CodeMemRegions[region] + && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize)) + { + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + // evil, though it should work for everything except DTCM which is not relevant here + start = blockAddr & ~(memorySize - 1); + size = memorySize; + return true; + } + else + return false; } +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); + void ResetBlockCache() { printf("Resetting JIT block cache...\n"); InvalidLiterals.Clear(); - FastBlockLookUp9.Reset(); - FastBlockLookUp7.Reset(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -1251,8 +1133,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } delete block; } @@ -1262,8 +1145,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } } JitBlocks9.clear(); @@ -1272,191 +1156,4 @@ void ResetBlockCache() JITCompiler->Reset(); } -template -JitBlockEntry LookUpBlockEntry(u32 addr) -{ - auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; - u32 entryOffset = fastMap.LookUp(addr); - if (entryOffset != UINT32_MAX) - return JITCompiler->AddEntryOffset(entryOffset); - - auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; - auto block = slowMap.find(addr); - if (block != slowMap.end()) - { - fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); - return block->second->EntryPoint; - } - return NULL; -} - -template JitBlockEntry LookUpBlockEntry<0>(u32); -template JitBlockEntry LookUpBlockEntry<1>(u32); - -template -void LinkBlock(ARM* cpu, u32 codeOffset) -{ - auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; - u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); - u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); - auto block = blockMap.find(targetPseudoPhys); - if (block == blockMap.end()) - { - CompileBlock(cpu); - block = blockMap.find(targetPseudoPhys); - } - - JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); - - block->second->AddLink(codeOffset); - JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); -} - -template void LinkBlock<0>(ARM*, u32); -template void LinkBlock<1>(ARM*, u32); - -void WifiWrite32(u32 addr, u32 val) -{ - Wifi::Write(addr, val & 0xFFFF); - Wifi::Write(addr + 2, val >> 16); -} - -u32 WifiRead32(u32 addr) -{ - return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); -} - -template -void VRAMWrite(u32 addr, T val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} -template -T VRAMRead(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} - -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) -{ - if (cpu->Num == 0) - { - switch (addr & 0xFF000000) - { - case 0x04000000: - if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) - return (void*)NDSCart::ReadROMData; - - /* - unfortunately we can't map GPU2D this way - since it's hidden inside an object - - though GPU3D registers are accessed much more intensive - */ - if (addr >= 0x04000320 && addr < 0x040006A4) - { - switch (size | store) - { - case 8: return (void*)GPU3D::Read8; - case 9: return (void*)GPU3D::Write8; - case 16: return (void*)GPU3D::Read16; - case 17: return (void*)GPU3D::Write16; - case 32: return (void*)GPU3D::Read32; - case 33: return (void*)GPU3D::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; - } - break; - case 0x06000000: - switch (size | store) - { - case 8: return (void*)VRAMRead; - case 9: return NULL; - case 16: return (void*)VRAMRead; - case 17: return (void*)VRAMWrite; - case 32: return (void*)VRAMRead; - case 33: return (void*)VRAMWrite; - } - break; - } - } - else - { - switch (addr & 0xFF800000) - { - case 0x04000000: - if (addr >= 0x04000400 && addr < 0x04000520) - { - switch (size | store) - { - case 8: return (void*)SPU::Read8; - case 9: return (void*)SPU::Write8; - case 16: return (void*)SPU::Read16; - case 17: return (void*)SPU::Write16; - case 32: return (void*)SPU::Read32; - case 33: return (void*)SPU::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; - } - break; - case 0x04800000: - if (addr < 0x04810000 && size >= 16) - { - switch (size | store) - { - case 16: return (void*)Wifi::Read; - case 17: return (void*)Wifi::Write; - case 32: return 
(void*)WifiRead32; - case 33: return (void*)WifiWrite32; - } - } - break; - case 0x06000000: - case 0x06800000: - switch (size | store) - { - case 8: return (void*)GPU::ReadVRAM_ARM7; - case 9: return (void*)GPU::WriteVRAM_ARM7; - case 16: return (void*)GPU::ReadVRAM_ARM7; - case 17: return (void*)GPU::WriteVRAM_ARM7; - case 32: return (void*)GPU::ReadVRAM_ARM7; - case 33: return (void*)GPU::WriteVRAM_ARM7; - } - } - } - return NULL; -} - } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 44a6140..2320b7b 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,32 +9,7 @@ namespace ARMJIT { -enum ExeMemKind -{ - exeMem_Unmapped = 0, - exeMem_ITCM, - exeMem_MainRAM, - exeMem_SWRAM, - exeMem_LCDC, - exeMem_ARM9_BIOS, - exeMem_ARM7_BIOS, - exeMem_ARM7_WRAM, - exeMem_ARM7_WVRAM, - exeMem_Count -}; - -extern const u32 ExeMemRegionOffsets[]; -extern const u32 ExeMemRegionSizes[]; - -typedef u32 (*JitBlockEntry)(); - -const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... - -u32 TranslateAddr9(u32 addr); -u32 TranslateAddr7(u32 addr); - -template -JitBlockEntry LookUpBlockEntry(u32 addr); +typedef void (*JitBlockEntry)(); void Init(); void DeInit(); @@ -43,44 +18,15 @@ void Reset(); void InvalidateByAddr(u32 pseudoPhysical); -void InvalidateRegionIfNecessary(u32 addr); - -inline void InvalidateMainRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); -} -inline void InvalidateITCMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); -} -inline void InvalidateLCDCIfNecessary(u32 addr) -{ - if (addr < 0x68A3FFF) - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); -} -inline void InvalidateSWRAM7IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); -} -inline void InvalidateSWRAM9IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); -} -inline void InvalidateARM7WRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); -} -inline void InvalidateARM7WVRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); -} +template +void CheckAndInvalidate(u32 addr); void CompileBlock(ARM* cpu); void ResetBlockCache(); -void UpdateMemoryStatus9(u32 start, u32 end); -void UpdateMemoryStatus7(u32 start, u32 end); +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size); } diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp index 0fe6a97..5f021a0 100644 --- a/src/ARMJIT_A64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -243,7 +243,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S && !CurInstr.SetFlags) S = false; - bool CVInGP = false; + bool CVInGPR = false; switch (op) { case 0x2: // SUB @@ -306,7 +306,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 UBFX(W2, RCPSR, 29, 1); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, rn, W2); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -335,7 +335,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 ORN(W1, WZR, op2.Reg.Rm, 
op2.ToArithOption()); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -355,7 +355,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 MVN(W1, rn); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -379,12 +379,12 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S) { - if (CVInGP) + if (CVInGPR) { BFI(RCPSR, W2, 29, 1); BFI(RCPSR, W3, 28, 1); } - Comp_RetriveFlags(!CVInGP); + Comp_RetriveFlags(!CVInGPR); } } @@ -501,7 +501,23 @@ void Compiler::A_Comp_ALUMovOp() MOVI2R(rd, op2.Imm); } else - MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } } if (S) @@ -558,10 +574,7 @@ void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + CLS(W0, rs); Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -594,10 +607,10 @@ void Compiler::A_Comp_Mul_Long() } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -628,6 +641,86 @@ void Compiler::A_Comp_Mul_Long() Comp_RetriveFlags(false); } +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? 
EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + void Compiler::A_Comp_Mul() { ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index 542f0b7..f130938 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -143,7 +143,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } @@ -152,23 +152,19 @@ void* Compiler::Gen_JumpTo9(int kind) AlignCode16(); void* res = GetRXPtr(); - MOVI2R(W2, kCodeCacheTiming); - // W1 - code cycles non branch - // W2 - branch code cycles LSR(W1, W0, 12); - LSL(W1, W1, 2); ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); LDRB(W1, RCPU, W1); - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, ITCMSize)); STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); - CMP(W0, W3); - FixupBranch outsideITCM = B(CC_LO); - MOVI2R(W1, 1); - MOVI2R(W2, 1); - SetJumpTarget(outsideITCM); + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); FixupBranch switchToThumb; if (kind == 0) @@ -176,40 +172,36 @@ void* Compiler::Gen_JumpTo9(int kind) if (kind == 0 || kind == 1) { - ANDI2R(W0, W0, ~3); - + // ARM if (kind == 0) ANDI2R(RCPSR, RCPSR, ~0x20); - ADD(W3, W0, 4); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); RET(); } + if (kind == 0 || kind == 2) { + // Thumb if (kind == 0) { SetJumpTarget(switchToThumb); - ORRI2R(RCPSR, RCPSR, 0x20); } ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); - ADD(W3, W0, 2); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - FixupBranch halfwordLoc = TBZ(W0, 1); - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); - RET(); - - SetJumpTarget(halfwordLoc); - ADD(RCycles, RCycles, W2); + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); RET(); } @@ -237,7 +229,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 0, 8); UBFX(W3, W3, 8, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~3); @@ -261,7 +253,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 16, 8); UBFX(W3, W3, 24, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~1); @@ -287,22 +279,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto } else { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); - bool previouslyDirty = CPSRDirty; + + bool cpsrDirty = CPSRDirty; SaveCPSR(); - - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
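/* An illustrative note, not part of the patch (it summarises the ARM.cpp and
   Comp_AddCycles hunks elsewhere in this commit): the cycle counter now counts
   down instead of up, which is why ADD(RCycles, ...) becomes SUB(RCycles, ...)
   throughout the A64 backend. The dispatcher seeds the counter with the
   remaining cycle budget and recovers the timestamp once the block returns:

       Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1;  // before dispatch
       // ... compiled code does SUB(RCycles, RCycles, n) as it executes ...
       NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1;  // after dispatch

   The "- 1" bias turns the loop exit test "Cycles <= 0" into the cheaper sign
   check "Cycles < 0", as the comment in the ARM.cpp hunk notes. */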
-            // we only save them, to load and save them again
-            SaveReg(reg, RegCache.Mapping[reg]);
-        }
-    }
+    SaveCycles();
+    PushRegs(restoreCPSR);
 
     if (switchThumb)
         MOV(W1, addr);
@@ -319,16 +300,12 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto
             QuickCallFunction(X3, jumpToTrampoline);
         else
             QuickCallFunction(X3, jumpToTrampoline);
-
-        if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE)
-        {
-            for (int reg : hiRegsLoaded)
-                LoadReg(reg, RegCache.Mapping[reg]);
-        }
-        if (previouslyDirty)
-            LoadCPSR();
-        CPSRDirty = previouslyDirty;
+        PopRegs(restoreCPSR);
+        LoadCycles();
+        LoadCPSR();
+        if (CurInstr.Cond() < 0xE)
+            CPSRDirty = cpsrDirty;
     }
 }
 
@@ -368,21 +345,13 @@ void Compiler::T_Comp_BCOND()
     s32 offset = (s32)(CurInstr.Instr << 24) >> 23;
     Comp_JumpTo(R15 + offset + 1, true);
 
-    Comp_BranchSpecialBehaviour();
+    Comp_BranchSpecialBehaviour(true);
 
     FixupBranch skipFailed = B();
     SetJumpTarget(skipExecute);
     Comp_AddCycles_C(true);
 
-    if (CurInstr.BranchFlags & branch_FollowCondTaken)
-    {
-        SaveCPSR(false);
-        RegCache.PrepareExit();
-
-        ADD(W0, RCycles, ConstantCycles);
-        ABI_PopRegisters(SavedRegs);
-        RET();
-    }
+    Comp_BranchSpecialBehaviour(false);
 
     SetJumpTarget(skipFailed);
 }
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
index a67f357..42435ed 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
@@ -1,9 +1,3 @@
-#include "ARMJIT_Compiler.h"
-
-#include "../ARMInterpreter.h"
-
-#include "../ARMJIT_Internal.h"
-
 #ifdef __SWITCH__
 #include "../switch/compat_switch.h"
 
@@ -13,10 +7,17 @@ extern char __start__;
 #include 
 #endif
 
+#include "ARMJIT_Compiler.h"
+
+#include "../ARMJIT_Internal.h"
+#include "../ARMInterpreter.h"
+#include "../Config.h"
+
 #include 
 
 using namespace Arm64Gen;
 
+extern "C" void ARM_Ret();
 
 namespace ARMJIT
 {
@@ -28,7 +29,10 @@ namespace ARMJIT
     like x64. On one hand you can translate a lot of instructions directly.
     But at the same time, there are a ton of exceptions, like for
     example ADD and SUB can't have a RORed second operand on ARMv8.
- */
+
+    While writing a JIT, when an instruction is recompiled into multiple ones,
+    take care not to write back until you've read all the other operands!
+*/
 
 template <>
 const ARM64Reg RegisterCache::NativeRegAllocOrder[] =
@@ -46,6 +50,132 @@ void Compiler::MovePC()
     ADD(MapReg(15), MapReg(15), Thumb ?
2 : 4); } +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); + + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + Compiler::Compiler() { #ifdef __SWITCH__ @@ -80,8 +210,7 @@ Compiler::Compiler() assert(succeded); SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); - JitMemUseableSize = JitMemSize; - Reset(); + JitMemMainSize = JitMemSize; #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); @@ -90,31 +219,8 @@ Compiler::Compiler() SetCodeBase(pageAligned, pageAligned); JitMemUseableSize = alignedSize; - Reset(); #endif - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - { - MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); - } - } - MemFunc7[0][0] = (void*)NDS::ARM7Read8; - MemFunc7[1][0] = (void*)NDS::ARM7Read16; - MemFunc7[2][0] = (void*)NDS::ARM7Read32; - MemFunc7[0][1] = (void*)NDS::ARM7Write8; - MemFunc7[1][1] = (void*)NDS::ARM7Write16; - MemFunc7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 2; j++) - { - MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); - MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); - } - } + SetCodePtr(0); for (int i = 0; i < 3; i++) { @@ -123,26 +229,26 @@ Compiler::Compiler() } /* - W0 - mode + W5 - mode W1 - reg num W3 - in/out value of reg */ { ReadBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, 
ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); RET(); @@ -166,19 +272,19 @@ Compiler::Compiler() { WriteBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); MOVI2R(W4, 0); @@ -206,9 +312,71 @@ Compiler::Compiler() RET(); } - //FlushIcache(); + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 33: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 17: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + case 9: QuickCallFunction(X3, SlowWrite7); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 33: QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead9); break; + case 17: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead9); break; + case 9: QuickCallFunction(X3, SlowRead7); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; - JitMemUseableSize -= GetCodeOffset(); SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); } @@ -227,6 +395,16 @@ Compiler::~Compiler() #endif } +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + void Compiler::LoadReg(int reg, ARM64Reg nativeReg) { if (reg == 15) @@ -325,7 +503,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), // Mul - F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, 
NULL, NULL, NULL, NULL, + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), // ARMv5 exclusives F(Clz), NULL, NULL, NULL, NULL, @@ -356,7 +534,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL, &Compiler::Nop }; #undef F @@ -404,29 +582,34 @@ bool Compiler::CanCompile(bool thumb, u16 kind) return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; } -void Compiler::Comp_BranchSpecialBehaviour() +void Compiler::Comp_BranchSpecialBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) + if (taken && CurInstr.BranchFlags & branch_IdleBranch) { MOVI2R(W0, 1); STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); } - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { - SaveCPSR(false); RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); } } JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) { - printf("JIT memory full, resetting...\n"); + printf("JIT far memory full, resetting...\n"); ResetBlockCache(); } @@ -437,21 +620,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] CurCPU = cpu; ConstantCycles = 0; RegCache = RegisterCache(this, instrs, instrsCount, true); - - //printf("compiling block at %x\n", R15 - (Thumb ? 
2 : 4)); - const u32 ALL_CALLEE_SAVED = 0x7FF80000; - - SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); - - //if (Num == 1) - { - ABI_PushRegisters(SavedRegs); - - MOVP2R(RCPU, CurCPU); - MOVI2R(RCycles, 0); - - LoadCPSR(); - } + CPSRDirty = false; for (int i = 0; i < instrsCount; i++) { @@ -486,6 +655,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (comp == NULL) { + SaveCycles(); SaveCPSR(); RegCache.Flush(); } @@ -535,25 +705,18 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] (this->*comp)(); } - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); if (cond < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipNop = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipNop); } @@ -565,76 +728,74 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } if (comp == NULL) + { + LoadCycles(); LoadCPSR(); + } } RegCache.Flush(); - //if (Num == 1) - { - SaveCPSR(); - - ADD(W0, RCycles, ConstantCycles); - - ABI_PopRegisters(SavedRegs); - } - //else - // ADD(RCycles, RCycles, ConstantCycles); - - RET(); + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); FlushIcache(); - //printf("finished\n"); - return res; } void Compiler::Reset() { + LoadStorePatches.clear(); + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; const u32 brk_0 = 0xD4200000; - for (int i = 0; i < JitMemUseableSize / 4; i++) + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) *(((u32*)GetRWPtr()) + i) = brk_0; } -void Compiler::Comp_AddCycles_C(bool nonConst) +void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (!nonConst && !CurInstr.Info.Branches()) + if (forceNonConstant) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 numI) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; - if (Thumb || CurInstr.Cond() >= 0xE) + if (Thumb || CurInstr.Cond() == 0xE) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
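With the ADD(RCycles, ...) calls turned into SUBs, the cycle counter now counts down: it is loaded from ARM::Cycles on entry, decremented as cycles accrue, and the block's constant share is folded into a single subtraction at the exit. The bookkeeping amounts to this (illustrative sketch in plain C++):

```cpp
s32 BlockCycleModel(s32 cyclesOnEntry, const s32* variableCosts, int n, s32 constantCycles)
{
    s32 rcycles = cyclesOnEntry;      // LoadCycles(): LDR RCycles, [RCPU, #Cycles]
    for (int i = 0; i < n; i++)
        rcycles -= variableCosts[i];  // SUB(RCycles, RCycles, cycles) per irregular instruction
    return rcycles - constantCycles;  // final SUB(RCycles, RCycles, ConstantCycles), then SaveCycles()
}
```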
0 : CurInstr.CodeCycles)) + c; - ADD(RCycles, RCycles, numI, shift); + SUB(RCycles, RCycles, cycles); if (Thumb || CurInstr.Cond() >= 0xE) - ConstantCycles += c; + ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CDI() @@ -671,7 +832,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } @@ -715,7 +876,7 @@ void Compiler::Comp_AddCycles_CD() } if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 5c9ef41..e4ffc63 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -9,6 +9,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include <unordered_map> + namespace ARMJIT { @@ -64,7 +66,14 @@ struct Op2 }; }; -class Compiler : Arm64Gen::ARM64XEmitter +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter { public: typedef void (Compiler::*CompileFunc)(); @@ -72,6 +81,9 @@ public: Compiler(); ~Compiler(); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Arm64Gen::ARM64Reg MapReg(int reg) { assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); @@ -89,7 +101,7 @@ public: void Reset(); - void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 numI); void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); void Comp_AddCycles_CD(); @@ -103,6 +115,9 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void LoadCycles(); + void SaveCycles(); + void Nop() {} void A_Comp_ALUTriOp(); @@ -111,6 +126,7 @@ public: void A_Comp_Mul(); void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); void A_Comp_Clz(); @@ -122,6 +138,8 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -168,7 +186,7 @@ public: void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); - void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); enum { memop_Writeback = 1 << 0, @@ -179,16 +197,33 @@ public: }; void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); - void* Gen_MemoryRoutine9(int size, bool store); - - void* Gen_MemoryRoutine9Seq(bool store, bool preinc); - void* Gen_MemoryRoutine7Seq(bool store, bool preinc); - // 0 = switch mode, 1 = stay arm, 2 = stay thumb void* Gen_JumpTo9(int kind); void* Gen_JumpTo7(int kind); - void Comp_BranchSpecialBehaviour(); + void Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; bool Exit; @@ -202,22 +237,20 @@ public: BitSet32
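Each LoadStorePatch describes one inlined fastmem sequence: PatchOffset points back from the faulting instruction to the start of the sequence, PatchSize is its byte length, and PatchFunc is the matching slow-path thunk. What RewriteMemAccess later does with a record, restated as a sketch (EncodeBL is a hypothetical stand-in for the emitter's BL()):

```cpp
#include <cstddef>

u32 EncodeBL(const u32* at, void* target); // hypothetical: encode a BL from 'at' to 'target'

void ApplyPatch(u32* rwCode, ptrdiff_t faultOffset, const LoadStorePatch& p)
{
    u32* start = rwCode + (faultOffset + p.PatchOffset) / 4; // PatchOffset is <= 0
    start[0] = EncodeBL(start, p.PatchFunc);                 // call the slow path instead
    for (u32 i = 1; i < p.PatchSize / 4; i++)
        start[i] = 0xD503201F;                               // pad the rest with AArch64 NOPs
}
```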
SavedRegs; - u32 JitMemUseableSize; + u32 JitMemSecondarySize; + u32 JitMemMainSize; void* ReadBanked, *WriteBanked; - // [size][store] - void* MemFunc9[3][2]; - void* MemFunc7[3][2]; - - // [store][pre increment] - void* MemFuncsSeq9[2][2]; - // "[code in main ram] - void* MemFuncsSeq7[2][2]; - void* JumpToFuncs9[3]; void* JumpToFuncs7[3]; + std::unordered_map<ptrdiff_t, LoadStorePatch> LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + RegisterCache<Compiler, Arm64Gen::ARM64Reg> RegCache; bool CPSRDirty = false; diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18 \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6cf710b..b307d0e 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -2,286 +2,62 @@ #include "../Config.h" +#include "../ARMJIT_Memory.h" + using namespace Arm64Gen; namespace ARMJIT { -// W0 - address -// (if store) W1 - value to store -// W2 - code cycles -void* Compiler::Gen_MemoryRoutine9(int size, bool store) +bool Compiler::IsJITFault(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - u32 addressMask; - switch (size) - { - case 32: addressMask = ~3; break; - case 16: addressMask = ~1; break; - case 8: addressMask = ~0; break; - } - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W3, W0, W3); - CMP(W3, W4); - FixupBranch insideDTCM = B(CC_LO); - - UBFX(W4, W0, 24, 8); - CMP(W4, 0x02); - FixupBranch outsideMainRAM = B(CC_NEQ); - ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); - MOVP2R(X4, NDS::MainRAM); - if (!store && size == 32) - { - LDR(W3, X3, X4); - ANDI2R(W0, W0, 3); - LSL(W0, W0, 3); - RORV(W0, W3, W0); - } - else if (store) - STRGeneric(size, W1, X3, X4); - else - LDRGeneric(size, false, W0, X3, X4); - RET(); - - SetJumpTarget(outsideMainRAM); - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W3); - FixupBranch insideITCM = B(CC_LO); - - if (store) - { - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: 
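ARM_Dispatch and ARM_Ret pin the emulated state in callee-saved registers (RCPU = X29, RCycles = W28, RCPSR = W27), which is why compiled blocks can call C++ helpers or tail-call one another without spilling anything. A C-level view of one round trip (the prototype is assumed from the asm, with x0 = cpu and x1 = block entry):

```cpp
extern "C" void ARM_Dispatch(ARM* cpu, JitBlockEntry block);

void StepJit(ARM* cpu, JitBlockEntry block)
{
    // ARM_Dispatch: saves x19-x30, loads cpu->Cycles/cpu->CPSR into W28/W27, br x1
    // ... the block runs, eventually branching to ARM_Ret ...
    // ARM_Ret: stores W28/W27 back into *cpu, restores x19-x30, ret
    ARM_Dispatch(cpu, block);
}
```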
QuickTailCall(X4, NDS::ARM9Write32); break; - case 16: QuickTailCall(X4, NDS::ARM9Write16); break; - case 8: QuickTailCall(X4, NDS::ARM9Write8); break; - } - } - else - { - if (size == 32) - ABI_PushRegisters({0, 30}); - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; - case 16: QuickTailCall (X4, NDS::ARM9Read16); break; - case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; - } - if (size == 32) - { - ABI_PopRegisters({1, 30}); - ANDI2R(W1, W1, 3); - LSL(W1, W1, 3); - RORV(W0, W0, W1); - RET(); - } - } - - SetJumpTarget(insideDTCM); - ANDI2R(W3, W3, 0x3FFF & addressMask); - ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - - RET(); - - SetJumpTarget(insideITCM); - ANDI2R(W3, W0, 0x7FFF & addressMask); - if (store) - { - ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4); - LSR(W5, W0, 9); - MOVP2R(X4, CodeRanges); - ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W4); - ABI_PushRegisters({1, 3, 30}); - QuickCallFunction(X4, InvalidateByAddr); - ABI_PopRegisters({1, 3, 30}); - SetJumpTarget(null); - } - ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - RET(); - - return res; + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); } -/* - W0 - base address - X1 - stack space - W2 - values count -*/ -void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +s64 Compiler::RewriteMemAccess(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W4, W0, W4); - CMP(W4, W5); - FixupBranch insideDTCM = B(CC_LO); + auto it = LoadStorePatches.find(pcOffset); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W4); - FixupBranch insideITCM = B(CC_LO); - - ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once - if (store) + if (it != LoadStorePatches.end()) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM9Write32); + LoadStorePatch patch = it->second; - ABI_PopRegisters({0, 1, 2, 30}); - } - else - { - QuickCallFunction(X4, NDS::ARM9Read32); - MOV(W4, W0); + ptrdiff_t curCodeOffset = GetCodeOffset(); - ABI_PopRegisters({0, 1, 2, 30}); + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); - STR(X4, X1, ArithOption(X2, true)); - } + BL(patch.PatchFunc); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); - SetJumpTarget(insideDTCM); + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); - ANDI2R(W4, W4, ~3 & 0x3FFF); - ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } + 
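The RORV sequence in the routine removed here (and kept by the new fastmem path) implements the ARMv5 behaviour for unaligned 32-bit loads: the aligned word is returned rotated right by 8 * (addr & 3). As a plain C++ model (sketch):

```cpp
u32 RotatedRead32(const u32* mem, u32 addr)
{
    u32 aligned = mem[(addr & ~3u) >> 2];
    u32 rot = (addr & 3) * 8;
    return (aligned >> rot) | (aligned << ((32 - rot) & 31)); // & 31 avoids the UB shift when rot == 0
}
```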
SetCodePtrUnsafe(curCodeOffset); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - - SetJumpTarget(insideITCM); - - ANDI2R(W4, W0, ~3 & 0x7FFF); - - ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X6); - } - else - { - LDR(W5, RCPU, X6); - STR(X5, X1, ArithOption(X2, true)); - } + LoadStorePatches.erase(it); - if (store) - { - ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); - LSR(W6, W4, 9); - MOVP2R(X5, CodeRanges); - ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W5); - ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W4); - QuickCallFunction(X5, InvalidateByAddr); - ABI_PopRegisters({0, 1, 2, 4, 30}); - SetJumpTarget(null); + return patch.PatchOffset; } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; + printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); } -void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - AlignCode16(); - void* res = GetRXPtr(); + u32 localAddr = LocaliseCodeAddress(Num, addr); - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); - - ABI_PushRegisters({0, 1, 2, 30}); - if (store) + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM7Write32); - ABI_PopRegisters({0, 1, 2, 30}); + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - else - { - QuickCallFunction(X4, NDS::ARM7Read32); - MOV(W4, W0); - ABI_PopRegisters({0, 1, 2, 30}); - STR(X4, X1, ArithOption(X2, true)); - } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; -} + Comp_AddCycles_CDI(); -void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -309,6 +85,8 @@ void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); + + return true; } void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) @@ -318,163 +96,209 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) addressMask = ~3; if (size == 16) addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } if (flags & memop_Store) Comp_AddCycles_CD(); else Comp_AddCycles_CDI(); - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) - { - u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? 
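Comp_MemLoadLiteral now reports success instead of assuming it: a PC-relative load is folded only when the pool word has not been flagged in InvalidLiterals, in which case the value is read once at compile time and rd becomes a known constant in the register cache. The shape of that decision (both helpers are hypothetical stand-ins for InvalidLiterals.Find and the compile-time bus read):

```cpp
int FindInvalidLiteral(u32 addr); // stand-in for InvalidLiterals.Find(localAddr)
u32 ReadGuestWord32(u32 addr);    // stand-in for the one-off read done while compiling

bool TryFoldLiteral(u32 addr, u32& valOut)
{
    if (FindInvalidLiteral(addr) != -1)
        return false;             // a store hit this pool entry; emit a normal load
    valOut = ReadGuestWord32(addr);
    return true;                  // followed by RegCache.PutLiteral(rd, valOut)
}
```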
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); - return; - } + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; } + ARM64Reg finalAddr = W0; + if (flags & memop_Post) { - ARM64Reg rdMapped = MapReg(rd); - ARM64Reg rnMapped = MapReg(rn); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; + finalAddr = rnMapped; + MOV(W0, rnMapped); + } - void* memFunc = Num == 0 - ? MemFunc9[size >> 4][!!(flags & memop_Store)] - : MemFunc7[size >> 4][!!((flags & memop_Store))]; + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might have become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) { - u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) - { - ARMv5* cpu5 = (ARMv5*)CurCPU; + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, &region); - } - } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, &region); + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); - if (region.Mem != NULL) - { - void* ptr = &region.Mem[addr & addressMask & region.Mask]; + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - MOVP2R(X0, ptr); - if (flags & memop_Store) - STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); - else - { - LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); - if (size == 32 && addr & ~0x3) - ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); - } - return; - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } - } + patch.PatchFunc = flags & memop_Store + ? 
PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); - ARM64Reg finalAddr = W0; - if (flags & memop_Post) - { - finalAddr = rnMapped; - MOV(W0, rnMapped); - } + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); if (flags & memop_Store) - MOV(W1, rdMapped); - - if (!offset.IsImm) - Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); - // offset might become an immediate - if (offset.IsImm) { - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Imm); - else - ADD(finalAddr, rnMapped, offset.Imm); + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); } else { - if (offset.Reg.ShiftType == ST_ROR) + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, X7); + if (size == 32) { - ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); - offset = Op2(W0); + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); } - - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); - else - ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); } - if (!(flags & memop_Post) && (flags & memop_Writeback)) - MOV(rnMapped, W0); + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); - if (inlinePreparation) + if (func) { - if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) - ANDI2R(rdMapped, W0, 3); - if (size > 8) - ANDI2R(W0, W0, addressMask); + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } } - QuickCallFunction(X2, memFunc); - if (!(flags & memop_Store)) + else { - if (inlinePreparation && !(flags & memop_Store) && size == 32) + if (Num == 0) { - if (constLocalROR32 == 4) + MOV(X1, RCPU); + if (flags & memop_Store) { - LSL(rdMapped, rdMapped, 3); - RORV(rdMapped, W0, rdMapped); + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + } } - else if (constLocalROR32 > 0) - ROR_(rdMapped, W0, constLocalROR32 << 3); else - MOV(rdMapped, W0); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + } + } } - else if (flags & memop_SignExtend) + else { - if (size == 16) - SXTH(rdMapped, W0); - else if (size == 8) - SXTB(rdMapped, W0); + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + } else - assert("What's wrong with you?"); + { + switch (size) + { + case 32: 
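The inlined fast path above is just one masked LDR/STR off the FastMem base register (X7), plus the same rotation fix-up for unaligned words as in the interpreter model shown earlier, here produced by the UBFIZ/RORV pair. A C++ model of the emitted load sequence (sketch, assuming the guest address space is mirrored at the FastMem*Start mappings):

```cpp
#include <cstring>

u32 FastPathRead32(u8* fastMemBase, u32 guestAddr)
{
    u32 word;
    memcpy(&word, fastMemBase + (guestAddr & ~3u), sizeof(word)); // ANDI2R + LDR rd, [X1, X7]
    u32 rot = (guestAddr & 3) * 8;                                // UBFIZ(W0, W0, 3, 2)
    return (word >> rot) | (word << ((32 - rot) & 31));           // RORV(rd, rd, W0)
}
```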
QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead7<u8>); break; + } + } } - else - MOV(rdMapped, W0); - - if (CurInstr.Info.Branches()) + + if (!(flags & memop_Store)) { - if (size < 32) - printf("LDR size < 32 branching?\n"); - Comp_JumpTo(rdMapped, Num == 0, false); + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); } } } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } } void Compiler::A_Comp_MemWB() @@ -589,19 +413,11 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - { - Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); - Comp_AddCycles_CDI(); - } - else - { - bool negative = addr < R15; - u32 abs = negative ? R15 - addr : addr - R15; - Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); - } + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -621,15 +437,138 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (regsCount == 0) return 0; // actually not the right behaviour TODO: fix me - SUB(SP, SP, ((regsCount + 1) & ~1) * 8); - if (store) + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 
4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; - int i = regsCount - 1; + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); BitSet16::Iterator it = regs.begin(); while (it != regs.end()) @@ -641,7 +580,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (usermode && reg >= 8 && reg < 15) { - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) MOV(W3, MapReg(reg)); else LoadReg(reg, W3); @@ -651,55 +590,67 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else if (!usermode && nextReg != regs.end()) { - ARM64Reg first = W3; - ARM64Reg second = W4; + ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); else LoadReg(reg, W3); - if (RegCache.Mapping[*nextReg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); - i--; + i++; it++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) + { STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } else { LoadReg(reg, W3); STR(INDEX_UNSIGNED, W3, SP, i * 8); } - i--; + i++; it++; } } - if (decrement) - { - SUB(W0, MapReg(rn), regsCount * 4); - preinc ^= true; - } - else - MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); MOVI2R(W2, regsCount); - BL(Num ? 
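The fast block-transfer path above handles an odd register count with one single-word STR/LDR first, then moves the rest two registers at a time so the emitter can use STP/LDP. The number of memory instructions it ends up emitting (sketch):

```cpp
// n = number of registers in the LDM/STM
int FastPathMemOps(int n)
{
    return (n & 1) + n / 2; // one single transfer for the odd register, then pairs
}
```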
MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break; + } + } if (!store) { - Comp_AddCycles_CDI(); - if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + UBFX(W5, RCPSR, 0, 5); - int i = regsCount - 1; BitSet16::Iterator it = regs.begin(); while (it != regs.end()) { @@ -714,11 +665,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc MOVI2R(W1, reg - 8); BL(WriteBanked); FixupBranch alreadyWritten = CBNZ(W4); - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) MOV(MapReg(reg), W3); - RegCache.DirtyRegs |= 1 << reg; - } else SaveReg(reg, W3); SetJumpTarget(alreadyWritten); @@ -727,20 +675,12 @@ { ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; - } - if (RegCache.Mapping[*nextReg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); - if (*nextReg != 15) - RegCache.DirtyRegs |= 1 << *nextReg; - } - LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); if (first == W3) SaveReg(reg, W3); @@ -748,15 +688,12 @@ ... SaveReg(*nextReg, W4); it++; - i--; + i++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) { ARM64Reg mapped = MapReg(reg); LDR(INDEX_UNSIGNED, mapped, SP, i * 8); - - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; } else { @@ -765,11 +702,20 @@ ... it++; - i--; + i++; } } ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + if (!store && regs[15]) { ARM64Reg mapped = MapReg(15); diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +} \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4e45760..19684c4 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -3,8 +3,11 @@ #include "types.h" #include <stdint.h> +#include <string.h> +#include <assert.h> #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // here lands everything which doesn't fit into ARMJIT.h // where it would be included by pretty much 
everything @@ -160,8 +163,8 @@ public: Data.SetLength(numAddresses * 2 + numLiterals); } - u32 PseudoPhysicalAddr; - + u32 StartAddr; + u32 StartAddrLocal; u32 InstrHash, LiteralHash; u8 Num; u16 NumAddresses; @@ -175,28 +178,8 @@ public: { return &Data[NumAddresses]; } u32* Literals() { return &Data[NumAddresses * 2]; } - u32* Links() - { return &Data[NumAddresses * 2 + NumLiterals]; } - - u32 NumLinks() - { return Data.Length - NumAddresses * 2 - NumLiterals; } - - void AddLink(u32 link) - { - Data.Add(link); - } - - void ResetLinks() - { - Data.SetLength(NumAddresses * 2 + NumLiterals); - } private: - /* - 0.. TinyVector<u32> Data; }; @@ -207,45 +190,32 @@ struct __attribute__((packed)) AddressRange u32 Code; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemoryStatus9[0x800000]; -extern u8 MemoryStatus7[0x800000]; - extern TinyVector<u32> InvalidLiterals; -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); - -template <u32 num> -void LinkBlock(ARM* cpu, u32 codeOffset); +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; -enum +inline bool PageContainsCode(AddressRange* range) { - memregion_Other = 0, - memregion_ITCM, - memregion_DTCM, - memregion_BIOS9, - memregion_MainRAM, - memregion_SWRAM9, - memregion_SWRAM7, - memregion_IO9, - memregion_VRAM, - memregion_BIOS7, - memregion_WRAM7, - memregion_IO7, - memregion_Wifi, - memregion_VWRAM, -}; + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); -int ClassifyAddress9(u32 addr); -int ClassifyAddress7(u32 addr); +template <u32 num> +void LinkBlock(ARM* cpu, u32 codeOffset); -template <typename T> T SlowRead9(ARMv5* cpu, u32 addr); -template <typename T> void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template <typename T> T SlowRead9(u32 addr, ARMv5* cpu); +template <typename T> void SlowWrite9(u32 addr, ARMv5* cpu, T val); template <typename T> T SlowRead7(u32 addr); template <typename T> void SlowWrite7(u32 addr, T val); diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..162827d --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,822 @@ +#ifdef __SWITCH__ +#include "switch/compat_switch.h" +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "GPU.h" +#include "GPU3D.h" +#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include <malloc.h> + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and mapping the memory regions into it as they're + structured on the DS. + + On most systems you have a single piece of main RAM, + maybe some video RAM and faster cache RAM and that's about it. + Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is the 16 KB SWRAM, which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform-specific code. 
+ +*/ + +namespace ARMJIT_Memory +{ +#ifdef __aarch64__ +struct FaultDescription +{ + u64 IntegerRegisters[33]; + u64 FaultAddr; + + u32 GetEmulatedAddr() + { + // now this is podracing + return (u32)IntegerRegisters[0]; + } + u64 RealAddr() + { + return FaultAddr; + } + + u64 GetPC() + { + return IntegerRegisters[32]; + } + + void RestoreAndRepeat(s64 offset); +}; +#else +struct FaultDescription +{ + u64 GetPC() + { + return 0; + } + + u32 GetEmulatedAddr() + { + return 0; + } + u64 RealAddr() + { + return 0; + } + + void RestoreAndRepeat(s64 offset); +}; +#endif + +void FaultHandler(FaultDescription* faultDesc); +} + + +#ifdef __aarch64__ + +extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +#endif + +#ifdef __SWITCH__ +// with LTO the symbols don't seem to be properly overridden +// if they're somewhere else + +extern "C" +{ +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); + desc.IntegerRegisters[29] = ctx->fp.x; + desc.IntegerRegisters[30] = ctx->lr.x; + desc.IntegerRegisters[31] = ctx->sp.x; + desc.IntegerRegisters[32] = ctx->pc.x; + + ARMJIT_Memory::FaultHandler(&desc); + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} +#endif + +namespace ARMJIT_Memory +{ + +#ifdef __aarch64__ +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + IntegerRegisters[32] += offset; + + ARM_RestoreContext(IntegerRegisters); +} +#else +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + +} +#endif + +void* FastMem9Start, *FastMem7Start; + +const u32 MemoryTotalSize = + NDS::MainRAMSize + + NDS::SharedWRAMSize + + NDS::ARM7WRAMSize + + DTCMPhysicalSize; + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; +const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; +const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#ifdef __SWITCH__ +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#else +u8* MemoryBase; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + printf("%x\n", r); + return R_SUCCEEDED(r); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + printf("%x skip\n", NDS::ARM9->DTCMSize); + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } + } + } + } +}; +ARMJIT::TinyVector<Mapping> Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? 
memstate_MappedProtected : memstate_MappedRW; + + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then mapping the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCBEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCBEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); + bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + { + Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + } + Mappings[memregion_SWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsMappable(region)) + return false; + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + + if (!isMapped) + return false; + + // this calculation even works with DTCM + // which doesn't have to be aligned to its own size + u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; + + u8* states = num == 0 ? 
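The mirror arithmetic above rounds an address down to the start of the mirror it falls into, measured from the start of the region's mapping window. A worked example (values chosen for illustration: 4 MB of physical main RAM mirrored through the 16 MB window at 0x02000000):

```cpp
u32 MirrorStart(u32 addr, u32 mappingStart, u32 memorySize)
{
    return (addr - mappingStart) / memorySize * memorySize + mappingStart;
}
// MirrorStart(0x02C12345, 0x02000000, 0x400000) == 0x02C00000
```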
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < memorySize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < memorySize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeded); + } + } + } + + Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + + return true; +} + +void FaultHandler(FaultDescription* faultDesc) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->GetEmulatedAddr(); + + if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + + s64 offset = 0; + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + } + faultDesc->RestoreAndRepeat(offset); + } +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeded); + + // 8 GB of address space, just don't ask... 
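FaultHandler above boils down to a three-way decision; restated as a sketch (all four helpers are stand-ins for the MappingStatus lookup, MapAtAddress, RewriteMemAccess and RestoreAndRepeat):

```cpp
bool PageIsUnmapped(u32 emuAddr); // MappingStatus9/7[emuAddr >> 12] == memstate_Unmapped
bool TryMapPage(u32 emuAddr);     // MapAtAddress
s64 RewriteToSlowPath(u64 pc);    // ARMJIT::JITCompiler->RewriteMemAccess
void ResumeAt(u64 pc);            // FaultDescription::RestoreAndRepeat

void OnJitFault(u64 pc, u32 emuAddr)
{
    bool rewrite = true;
    if (PageIsUnmapped(emuAddr))
        rewrite = !TryMapPage(emuAddr); // mapping succeeded: just retry the access
    ResumeAt(pc + (rewrite ? RewriteToSlowPath(pc) : 0));
}
```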
+ FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; +#else + MemoryBase = new u8[MemoryTotalSize]; + + NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; +#endif +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#else + delete[] MemoryBase; +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsMappable(int region) +{ + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mappingStart = 0; + mappingSize = NDS::ARM9->ITCMSize; + memorySize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_DTCM: + if (num == 0) + { + mappingStart = NDS::ARM9->DTCMBase; + mappingSize = NDS::ARM9->DTCMSize; + memorySize = DTCMPhysicalSize; + return true; + } + return false; + case memregion_BIOS9: + if (num == 0) + { + mappingStart = 0xFFFF0000; + mappingSize = 0x10000; + memorySize = 0x1000; + return true; + } + return false; + case memregion_MainRAM: + mappingStart = 0x2000000; + mappingSize = 0x1000000; + memorySize = NDS::MainRAMSize; + return true; + case memregion_SWRAM: + mappingStart = 0x3000000; + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mappingSize = 0x1000000; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM9.Mask + 1; + return true; + } + else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mappingSize = 0x800000; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + // this is a gross simplification + // mostly to make code on vram working + // it doesn't take any of the actual VRAM mappings into account + mappingStart = 0x6000000; + mappingSize = 0x1000000; + memorySize = 0x100000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mappingStart = 0; + mappingSize = 0x4000; + memorySize = 0x4000; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + if (NDS::SWRAM_ARM7.Mem) + { + mappingStart = 0x3800000; + mappingSize = 0x800000; + } + else + { + mappingStart = 0x3000000; + mappingSize = 0x1000000; + } + memorySize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mappingStart = 0x6000000; 
+ mappingSize = 0x1000000; + memorySize = 0x20000; + return true; + } + return false; + default: + // for the JIT we don't care about the rest + return false; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM9.Mem) + return memregion_SWRAM; + else + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7.Mem) + return memregion_SWRAM; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensively + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; + } + } + else + { + switch 
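Throughout GetFuncForAddr the lookup key is (size | store): the access width in bits with the low bit marking a write. A couple of compile-time checks make the encoding explicit (sketch):

```cpp
constexpr int AccessKey(int bits, bool store) { return bits | (store ? 1 : 0); }
static_assert(AccessKey(16, true) == 17, "16-bit write");
static_assert(AccessKey(32, false) == 32, "32-bit read");
```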
(addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; + case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; + case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; + case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; + case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; + case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; + } + } + } + return NULL; +} + +} \ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..1a59d98 --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,53 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); + +bool IsMappable(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd3fb70..34c1c91 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -301,24 +301,6 @@ Compiler::Compiler() RET(); } - { - CPSRDirty = true; - BranchStub[0] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<0>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - - CPSRDirty = true; - BranchStub[1] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<1>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - } - // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -520,6 +502,11 @@ void Compiler::Reset() FarCode = FarStart; } +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + void Compiler::Comp_SpecialBranchBehaviour(bool taken) { if (taken && CurInstr.BranchFlags & branch_IdleBranch) @@ -531,32 +518,
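A pointer returned by GetFuncForAddr is consumed untyped; the emitter moves the operands into the guest-call registers itself and calls it, as Comp_MemAccess does with QuickCallFunction(X2, (void (*)())func). A C++ equivalent for a 32-bit store (sketch; the signature matches handlers like NDS::ARM9IOWrite32):

```cpp
using Store32Fn = void (*)(u32 addr, u32 val);

bool TryDirectStore32(void* func, u32 addr, u32 val)
{
    if (!func)
        return false; // no dedicated handler: take the generic slow path
    reinterpret_cast<Store32Fn>(func)(addr, val);
    return true;
}
```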
@@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) RegCache.PrepareExit(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) - && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)&ARM_Ret, true); - } + JMP((u8*)&ARM_Ret, true); } } -JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... { @@ -575,7 +541,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; // CPSR might have been modified in a previous block - CPSRDirty = Config::JIT_BrancheOptimisations == 2; + CPSRDirty = false; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); @@ -685,31 +651,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F RegCache.Flush(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 - && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) - && (!instrs[instrsCount - 1].Info.Branches() - || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken - || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)ARM_Ret, true); - } + JMP((u8*)ARM_Ret, true); /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -720,22 +662,6 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F return res; } -void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - JMP((u8*)entry, true); - SetCodePtr(curPtr); -} - -void Compiler::UnlinkBlock(u32 offset) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - NOP(5); - SetCodePtr(curPtr); -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f2fc301..09ac257 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -52,10 +52,7 @@ public: void Reset(); - void LinkBlock(u32 offset, JitBlockEntry entry); - void UnlinkBlock(u32 offset); - - JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -202,6 +199,10 @@ public: SetCodePtr(FarCode); } + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + u8* FarCode; u8* NearCode; u32 FarSize; @@ -216,8 +217,6 @@ public: bool Exit; bool IrregularCycles; - void* BranchStub[2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index cf0bd23..0bf2f83 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -15,6 +15,11 @@ int squeezePointer(T* ptr) return truncated; } +s32 Compiler::RewriteMemAccess(u64 pc) +{ + return 0; +} + /* According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) of all memory load and store instructions always access addresses in the same region as @@ -27,14 +32,15 @@ int squeezePointer(T* ptr) bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); + return false; + //u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - } + }*/ u32 val; // make sure arm7 bios is accessible @@ -95,7 +101,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - if (!addrIsStatic) + if (true) { OpArg rnMapped = MapReg(rn); if (Thumb && rn == 15) @@ -145,7 +151,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, rnMapped, R(finalAddr)); } - int expectedTarget = Num == 0 + /*int expectedTarget = Num == 0 ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); if (CurInstr.Cond() < 0xE) @@ -184,8 +190,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (addrIsStatic && compileSlowPath) MOV(32, R(RSCRATCH3), Imm32(staticAddress)); - - if (compileFastPath) +*/ + /*if (compileFastPath) { FixupBranch slowPath; if (compileSlowPath) @@ -357,15 +363,16 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz SetJumpTarget(slowPath); } } - - if (compileSlowPath) +*/ + if (true) { PushRegs(false); if (Num == 0) { - MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); if (flags & memop_Store) { MOV(32, R(ABI_PARAM3), rdMapped); @@ -423,13 +430,13 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); } } - +/* if (compileFastPath && compileSlowPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!(flags & memop_Store) && rd == 15) { @@ -458,7 +465,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 stackAlloc = ((regsCount + 1) & ~1) * 8; #endif u32 allocOffset = stackAlloc - regsCount * 8; - +/* int expectedTarget = Num == 0 ? ClassifyAddress9(CurInstr.DataRegion) : ClassifyAddress7(CurInstr.DataRegion); @@ -479,7 +486,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc default: break; } - +*/ if (!store) Comp_AddCycles_CDI(); else @@ -492,7 +499,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else MOV(32, R(RSCRATCH4), MapReg(rn)); - +/* if (compileFastPath) { assert(!usermode); @@ -570,7 +577,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SwitchToFarCode(); SetJumpTarget(slowPath); - } + }*/ if (!store) { @@ -696,13 +703,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PopRegs(false); } - +/* if (compileFastPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!store && regs[15]) { diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b50e821..ccec951 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -206,15 +206,14 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15, - - T_SetNZ = 1 << 16, - T_SetCV = 1 << 17, - T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19, - T_SetC = 1 << 20, + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, - T_WriteMem = 1 << 21, + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -256,31 +255,31 @@ const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); -const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); -const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); -const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 
T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); -const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); -const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); -const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); -const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); -const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); -const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); -const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); -const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); -const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); @@ -347,7 +346,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_BranchAlways) res.DstRegs |= (1 << 15); - if (data & T_PopPC && instr & (1 << 8)) + if (res.Kind == tk_POP && instr & (1 << 8)) res.DstRegs |= 1 << 15; if (data & T_SetNZ) @@ -364,11 +363,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; - if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + if (data & T_LoadMem) { - if (!Config::JIT_LiteralOptimisations) - res.SrcRegs |= 1 << 15; - res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) @@ -401,11 +407,17 @@ Info Decode(bool thumb, u32 num, u32 instr) else if ((instr >> 28) == 0xF) data = ak(ak_Nop); - if (data & A_UnkOnARM7 && num != 0) + if (data & A_UnkOnARM7 && num == 1) data = A_UNK; res.Kind = (data >> 22) & 0x1FF; + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + if (res.Kind == ak_MCR) { u32 cn = (instr >> 16) & 0xF; @@ -490,8 +502,13 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_WriteMem) res.SpecialKind = special_WriteMem; - if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) - res.SpecialKind = special_LoadLiteral; + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } if (res.Kind == ak_LDM) { diff --git 
a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 6ab4929..a702435 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -232,6 +232,7 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, + special_LoadMem, special_WaitForInterrupt, special_LoadLiteral }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f35b3e9..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,9 +55,11 @@ if (ENABLE_JIT) enable_language(ASM) target_sources(core PRIVATE - ARMJIT.cpp ARM_InstrInfo.cpp + ARMJIT.cpp + ARMJIT_Memory.cpp + dolphin/CommonFuncs.cpp ) @@ -85,6 +87,8 @@ if (ENABLE_JIT) ARMJIT_A64/ARMJIT_ALU.cpp ARMJIT_A64/ARMJIT_LoadStore.cpp ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s ) endif() endif() diff --git a/src/CP15.cpp b/src/CP15.cpp index 225847e..3d64259 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -22,6 +22,7 @@ #include "DSi.h" #include "ARM.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -42,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -75,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -98,36 +99,30 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { -#ifdef JIT_ENABLED - u32 oldDTCMBase = DTCMBase; - u32 oldDTCMSize = DTCMSize; -#endif + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) { - ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); - ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; } -#endif } void ARMv5::UpdateITCMSetting() { -#ifdef JIT_ENABLED - u32 oldITCMSize = ITCMSize; -#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -138,10 +133,6 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldITCMSize != ITCMSize) - ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); -#endif } @@ -581,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. 
%08X\n", val); + //Halt(255); return; @@ -723,7 +717,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -750,13 +744,13 @@ void ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -773,13 +767,13 @@ void ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -796,13 +790,13 @@ void ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -817,13 +811,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -838,16 +832,16 @@ void ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -864,16 +858,16 @@ void ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -890,16 +884,16 @@ void ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -914,16 +908,16 @@ void 
ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 22e9c11..edf84f2 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -47,8 +47,9 @@ int JIT_LiteralOptimisations = true; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; +int JIT_BrancheOptimisations = true; int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; #endif ConfigEntry ConfigFile[] = @@ -72,8 +73,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 31fa67a..7b19a4b 100644 --- a/src/Config.h +++ b/src/Config.h @@ -63,6 +63,7 @@ extern int JIT_Enable; extern int JIT_MaxBlockSize; extern int JIT_BrancheOptimisations; extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 657241f..3d65482 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,7 @@ #include "AREngine.h" #include "Platform.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -94,17 +95,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -171,6 +172,10 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); +#else + MainRAM = new u8[MainRAMSize]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; #endif DMAs[0] = new DMA(0, 0); @@ -485,6 +490,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -510,7 +519,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -587,10 +596,6 @@ void Reset() } AREngine::Reset(); - -#ifdef JIT_ENABLED - ARMJIT::Reset(); -#endif } void Stop() @@ -705,7 +710,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -1128,43 +1133,40 @@ void MapSharedWRAM(u8 val) if (val == WRAMCnt) return; + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch (WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - 
SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } - -#ifdef JIT_ENABLED - ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); - ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); -#endif } @@ -1835,12 +1837,12 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & MainRAMMask]; + return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1900,12 +1902,12 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & MainRAMMask]; + return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1968,9 +1970,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -2026,7 +2028,7 @@ void ARM9Write8(u32 addr, u8 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2035,12 +2037,12 @@ void ARM9Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2085,7 +2087,7 @@ void ARM9Write16(u32 addr, u16 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2094,12 +2096,12 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2113,18 +2115,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2165,7 +2165,7 @@ void ARM9Write32(u32 addr, u32 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2174,12 +2174,12 @@ void ARM9Write32(u32 addr, u32 val) return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2193,18 +2193,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2250,10 +2248,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2292,17 +2290,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2352,17 +2350,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2419,17 +2417,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return 
*(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2474,7 +2472,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2483,28 +2481,28 @@ void ARM7Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2514,7 +2512,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2551,7 +2549,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2560,28 +2558,28 @@ void ARM7Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2599,7 +2597,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2638,7 +2636,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2647,28 +2645,28 @@ void ARM7Write32(u32 addr, u32 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef 
JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2687,7 +2685,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2736,17 +2734,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } diff --git a/src/NDS.h b/src/NDS.h index e9b56da..4b4f9a1 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,20 +162,20 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; -extern u8 SharedWRAM[0x8000]; -extern u8* SWRAM_ARM9; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM9Mask; -extern u32 SWRAM_ARM7Mask; - -extern u8 ARM7WRAM[0x10000]; +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); -- cgit v1.2.3 From ea6d03581b689738d0d1930b28d1588019cf4077 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jun 2020 15:51:19 +0200 Subject: make literal optimisation work again enable single register block load/store optimisations for x64 aswell --- src/ARMJIT_x64/ARMJIT_Compiler.h | 12 +++---- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 72 ++++++++++++++++++++++++------------- 2 files changed, 54 insertions(+), 30 deletions(-) (limited to 'src') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 09ac257..d1a6c07 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -18,15 +18,15 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; const Gen::X64Reg RSCRATCH4 = Gen::R8; -struct ComplexOperand +struct Op2 { - ComplexOperand() + Op2() {} - ComplexOperand(u32 imm) + Op2(u32 imm) : IsImm(true), Imm(imm) {} - ComplexOperand(int reg, int op, int amount) + Op2(int reg, int op, int amount) : IsImm(false) { Reg.Reg = reg; @@ -135,9 +135,9 @@ public: memop_Store = 1 << 3, memop_SubtractOffset = 1 << 4 }; - void 
Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); + void Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - bool Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 0bf2f83..b780c55 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -30,17 +30,18 @@ s32 Compiler::RewriteMemAccess(u64 pc) improvement. */ -bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - return false; - //u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); + u32 localAddr = LocaliseCodeAddress(Num, addr); - /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - }*/ + } + + Comp_AddCycles_CDI(); u32 val; // make sure arm7 bios is accessible @@ -52,23 +53,29 @@ bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) val = ROR(val, (addr & 0x3) << 3); } else if (size == 16) + { CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } else + { CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } CurCPU->R[15] = tmpR15; MOV(32, MapReg(rd), Imm32(val)); if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); - - Comp_AddCycles_CDI(); - + return true; } -void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) +void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags) { u32 addressMask = ~0; if (size == 32) @@ -76,11 +83,11 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - if (Comp_MemLoadLiteral(size, rd, addr)) + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) return; } @@ -455,6 +462,23 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { int regsCount = regs.Count(); + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) + { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + s32 offset = (regsCount * 4) * (decrement ? 
-1 : 1); // we need to make sure that the stack stays aligned to 16 bytes @@ -743,10 +767,10 @@ void Compiler::A_Comp_MemWB() if (!(CurInstr.Instr & (1 << 23))) flags |= memop_SubtractOffset; - ComplexOperand offset; + Op2 offset; if (!(CurInstr.Instr & (1 << 25))) { - offset = ComplexOperand(CurInstr.Instr & 0xFFF); + offset = Op2(CurInstr.Instr & 0xFFF); } else { @@ -754,7 +778,7 @@ void Compiler::A_Comp_MemWB() int amount = (CurInstr.Instr >> 7) & 0x1F; int rm = CurInstr.A_Reg(0); - offset = ComplexOperand(rm, op, amount); + offset = Op2(rm, op, amount); } Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); @@ -762,9 +786,9 @@ void Compiler::A_Comp_MemWB() void Compiler::A_Comp_MemHalf() { - ComplexOperand offset = CurInstr.Instr & (1 << 22) - ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : ComplexOperand(CurInstr.A_Reg(0), 0, 0); + Op2 offset = CurInstr.Instr & (1 << 22) + ? Op2(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : Op2(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -806,7 +830,7 @@ void Compiler::T_Comp_MemReg() bool load = op & 0x2; bool byte = op & 0x1; - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), byte ? 8 : 32, load ? 0 : memop_Store); } @@ -839,7 +863,7 @@ void Compiler::T_Comp_MemImm() bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), byte ? 8 : 32, load ? 0 : memop_Store); } @@ -856,7 +880,7 @@ void Compiler::T_Comp_MemRegHalf() if (!load) flags |= memop_Store; - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), size, flags); } @@ -865,7 +889,7 @@ void Compiler::T_Comp_MemImmHalf() u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, load ? 0 : memop_Store); } @@ -873,8 +897,8 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) - Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -882,7 +906,7 @@ void Compiler::T_Comp_MemSPRel() u32 offset = (CurInstr.Instr & 0xFF) * 4; bool load = CurInstr.Instr & (1 << 11); - Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 
0 : memop_Store); } -- cgit v1.2.3 From c5381d2911d47fb1fcbd6ec27a83f5da3606c4bd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 30 Jun 2020 23:50:41 +0200 Subject: reconcile DSi and JIT, fastmem for x64 and Windows --- src/ARM.cpp | 23 +- src/ARM.h | 2 +- src/ARMJIT.cpp | 273 +-- src/ARMJIT.h | 2 + src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 4 +- src/ARMJIT_Internal.h | 12 +- src/ARMJIT_Memory.cpp | 636 ++++-- src/ARMJIT_Memory.h | 16 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 109 + src/ARMJIT_x64/ARMJIT_Compiler.h | 14 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 632 +++--- src/CP15.cpp | 21 + src/Config.cpp | 20 +- src/Config.h | 9 +- src/DSi.cpp | 167 +- src/DSi.h | 15 + src/DSi_I2C.cpp | 4 +- src/NDS.cpp | 41 +- src/NDS.h | 2 + src/frontend/qt_sdl/EmuSettingsDialog.cpp | 115 +- src/frontend/qt_sdl/EmuSettingsDialog.h | 5 +- src/frontend/qt_sdl/EmuSettingsDialog.ui | 598 +++--- src/frontend/qt_sdl/main.cpp | 9 +- src/frontend/qt_sdl/main.h | 1 + src/libui_sdl/DlgEmuSettings.cpp | 252 --- src/libui_sdl/libui/ui.h | 764 ------- src/libui_sdl/libui/unix/stddialogs.c | 126 -- src/libui_sdl/libui/windows/stddialogs.cpp | 180 -- src/libui_sdl/main.cpp | 3061 ---------------------------- 29 files changed, 1656 insertions(+), 5457 deletions(-) delete mode 100644 src/libui_sdl/DlgEmuSettings.cpp delete mode 100644 src/libui_sdl/libui/ui.h delete mode 100644 src/libui_sdl/libui/unix/stddialogs.c delete mode 100644 src/libui_sdl/libui/windows/stddialogs.cpp delete mode 100644 src/libui_sdl/main.cpp (limited to 'src') diff --git a/src/ARM.cpp b/src/ARM.cpp index e529be8..8530795 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,12 +21,15 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" -#include "ARMJIT.h" #include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif // instruction timing notes // @@ -109,6 +112,12 @@ void ARM::Reset() CodeMem.Mem = NULL; +#ifdef JIT_ENABLED + FastBlockLookup = NULL; + FastBlockLookupStart = 0; + FastBlockLookupSize = 0; +#endif + // zorp JumpTo(ExceptionBase); } @@ -752,6 +761,12 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #ifdef JIT_ENABLED @@ -820,6 +835,12 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #endif diff --git a/src/ARM.h b/src/ARM.h index b7f16d6..0248e26 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -147,7 +147,7 @@ public: NDS::MemRegion CodeMem; #ifdef JIT_ENABLED - u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; #endif diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 53b28c1..2a61c38 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -18,6 +18,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "SPU.h" @@ -38,25 +39,35 @@ namespace ARMJIT Compiler* JITCompiler; AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; -AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMMaxSize / 512]; AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; AddressRange CodeIndexVRAM[0x100000 / 512]; AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; 
+AddressRange CodeIndexBIOS9DSi[0x10000 / 512]; +AddressRange CodeIndexBIOS7DSi[0x10000 / 512]; +AddressRange CodeIndexNWRAM_A[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_B[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_C[DSi::NWRAMSize / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; -u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMMaxSize / 2]; u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; u64 FastBlockLookupVRAM[0x100000 / 2]; u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; +u64 FastBlockLookupBIOS9DSi[0x10000 / 2]; +u64 FastBlockLookupBIOS7DSi[0x10000 / 2]; +u64 FastBlockLookupNWRAM_A[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_B[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_C[DSi::NWRAMSize / 2]; const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { @@ -64,7 +75,7 @@ const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = ITCMPhysicalSize, 0, sizeof(NDS::ARM9BIOS), - NDS::MainRAMSize, + NDS::MainRAMMaxSize, NDS::SharedWRAMSize, 0, 0x100000, @@ -73,6 +84,11 @@ const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = 0, 0, 0x40000, + 0x10000, + 0x10000, + sizeof(DSi::NWRAM_A), + sizeof(DSi::NWRAM_B), + sizeof(DSi::NWRAM_C), }; AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = @@ -90,6 +106,11 @@ AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = NULL, NULL, CodeIndexARM7WVRAM, + CodeIndexBIOS9DSi, + CodeIndexBIOS7DSi, + CodeIndexNWRAM_A, + CodeIndexNWRAM_B, + CodeIndexNWRAM_C }; u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = @@ -106,7 +127,12 @@ u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = FastBlockLookupARM7WRAM, NULL, NULL, - FastBlockLookupARM7WVRAM + FastBlockLookupARM7WVRAM, + FastBlockLookupBIOS9DSi, + FastBlockLookupBIOS7DSi, + FastBlockLookupNWRAM_A, + FastBlockLookupNWRAM_B, + FastBlockLookupNWRAM_C }; u32 LocaliseCodeAddress(u32 num, u32 addr) @@ -115,21 +141,14 @@ u32 LocaliseCodeAddress(u32 num, u32 addr) ? ARMJIT_Memory::ClassifyAddress9(addr) : ARMJIT_Memory::ClassifyAddress7(addr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize) - && CodeMemRegions[region]) - { - addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - addr |= (u32)region << 28; - return addr; - } + if (CodeMemRegions[region]) + return ARMJIT_Memory::LocaliseAddress(region, num, addr); return 0; } TinyVector InvalidLiterals; -template +template T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; @@ -141,11 +160,11 @@ T SlowRead9(u32 addr, ARMv5* cpu) else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; else if (std::is_same::value) - val = NDS::ARM9Read32(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read32 : DSi::ARM9Read32)(addr); else if (std::is_same::value) - val = NDS::ARM9Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read16 : DSi::ARM9Read16)(addr); else - val = NDS::ARM9Read8(addr); + val = (ConsoleType == 0 ? 
NDS::ARM9Read8 : DSi::ARM9Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -153,7 +172,7 @@ T SlowRead9(u32 addr, ARMv5* cpu) return val; } -template +template void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); @@ -169,27 +188,19 @@ void SlowWrite9(u32 addr, ARMv5* cpu, T val) } else if (std::is_same::value) { - NDS::ARM9Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write32 : DSi::ARM9Write32)(addr, val); } else if (std::is_same::value) { - NDS::ARM9Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write16 : DSi::ARM9Write16)(addr, val); } else { - NDS::ARM9Write8(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write8 : DSi::ARM9Write8)(addr, val); } } -template void SlowWrite9(u32, ARMv5*, u32); -template void SlowWrite9(u32, ARMv5*, u16); -template void SlowWrite9(u32, ARMv5*, u8); - -template u32 SlowRead9(u32, ARMv5*); -template u16 SlowRead9(u32, ARMv5*); -template u8 SlowRead9(u32, ARMv5*); - -template +template T SlowRead7(u32 addr) { u32 offset = addr & 0x3; @@ -197,11 +208,11 @@ T SlowRead7(u32 addr) T val; if (std::is_same::value) - val = NDS::ARM7Read32(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read32 : DSi::ARM7Read32)(addr); else if (std::is_same::value) - val = NDS::ARM7Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read16 : DSi::ARM7Read16)(addr); else - val = NDS::ARM7Read8(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read8 : DSi::ARM7Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -209,67 +220,71 @@ T SlowRead7(u32 addr) return val; } -template +template void SlowWrite7(u32 addr, T val) { addr &= ~(sizeof(T) - 1); if (std::is_same::value) - NDS::ARM7Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write32 : DSi::ARM7Write32)(addr, val); else if (std::is_same::value) - NDS::ARM7Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write16 : DSi::ARM7Write16)(addr, val); else - NDS::ARM7Write8(addr, val); + (ConsoleType == 0 ? 
NDS::ARM7Write8 : DSi::ARM7Write8)(addr, val); } -template +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite9(addr, cpu, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(addr, cpu); + data[i] = SlowRead9(addr, cpu); addr += 4; } } -template +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite7(addr, data[i]); + SlowWrite7(addr, data[i]); else - data[i] = SlowRead7(addr); + data[i] = SlowRead7(addr); addr += 4; } } -template void SlowWrite7(u32, u32); -template void SlowWrite7(u32, u16); -template void SlowWrite7(u32, u8); - -template u32 SlowRead7(u32); -template u16 SlowRead7(u32); -template u8 SlowRead7(u32); - -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +#define INSTANTIATE_SLOWMEM(consoleType) \ + template void SlowWrite9(u32, ARMv5*, u32); \ + template void SlowWrite9(u32, ARMv5*, u16); \ + template void SlowWrite9(u32, ARMv5*, u8); \ + \ + template u32 SlowRead9(u32, ARMv5*); \ + template u16 SlowRead9(u32, ARMv5*); \ + template u8 SlowRead9(u32, ARMv5*); \ + \ + template void SlowWrite7(u32, u32); \ + template void SlowWrite7(u32, u16); \ + template void SlowWrite7(u32, u8); \ + \ + template u32 SlowRead7(u32); \ + template u16 SlowRead7(u32); \ + template u8 SlowRead7(u32); \ + \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + +INSTANTIATE_SLOWMEM(0) +INSTANTIATE_SLOWMEM(1) template struct UnreliableHashTable @@ -616,6 +631,12 @@ void CompileBlock(ARM* cpu) u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; auto existingBlockIt = map.find(blockAddr); if (existingBlockIt != map.end()) @@ -623,18 +644,24 @@ void CompileBlock(ARM* cpu) // there's already a block, though it's not inside the fast map // could be that there are two blocks at the same physical addr // but different mirrors - u32 localAddr = existingBlockIt->second->StartAddrLocal; + u32 otherLocalAddr = existingBlockIt->second->StartAddrLocal; - u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; - *entry = ((u64)blockAddr | cpu->Num) << 32; - *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); - return; - } + if (localAddr == otherLocalAddr) + { + JIT_DEBUGPRINT("switching out block %x %x %x\n", localAddr, blockAddr, existingBlockIt->second->StartAddr); - u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); - if (!localAddr) - { - printf("trying to compile non executable code? 
%x\n", blockAddr); + u64* entry = &FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + // some memory has been remapped + JitBlock* prevBlock = RestoreCandidates.Insert(existingBlockIt->second->InstrHash, existingBlockIt->second); + if (prevBlock) + delete prevBlock; + + map.erase(existingBlockIt); } FetchedInstr instrs[Config::JIT_MaxBlockSize]; @@ -655,7 +682,7 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -678,7 +705,7 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); - assert(translatedAddr); + assert(translatedAddr >> 27); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -727,7 +754,10 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; - if (instrs[i].Info.DstRegs & (1 << 14)) + if (instrs[i].Info.DstRegs & (1 << 14) + || (!thumb + && (instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_IMM || instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_REG) + && instrs[i].Instr & (1 << 16))) hasLink = false; if (thumb) @@ -792,7 +822,7 @@ void CompileBlock(ARM* cpu) i--; } - if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + if (instrs[i].Info.Branches() && Config::JIT_BranchOptimisations) { bool hasBranched = cpu->R[15] != r15; @@ -830,8 +860,6 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); - if (link) { lr = linkAddr; @@ -927,6 +955,8 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + + JIT_DEBUGPRINT("block start %p\n", block->EntryPoint); } else { @@ -940,12 +970,12 @@ void CompileBlock(ARM* cpu) assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; - if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) - ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); - AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; range->Code |= addressMasks[j]; range->Blocks.Add(block); } @@ -955,7 +985,7 @@ void CompileBlock(ARM* cpu) else JitBlocks7[blockAddr] = block; - u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + u64* entry = &FastBlockLookupRegions[(localAddr >> 27)][(localAddr & 0x7FFFFFF) / 2]; *entry = ((u64)blockAddr | cpu->Num) << 32; *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } @@ -964,8 +994,8 @@ void InvalidateByAddr(u32 localAddr) { JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* 
region = CodeMemRegions[localAddr >> 28]; - AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + AddressRange* region = CodeMemRegions[localAddr >> 27]; + AddressRange* range = ®ion[(localAddr & 0x7FFFFFF) / 512]; u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; @@ -994,9 +1024,9 @@ void InvalidateByAddr(u32 localAddr) range->Blocks.Remove(i); if (range->Blocks.Length == 0 - && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) { - ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + ARMJIT_Memory::SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); } bool literalInvalidation = false; @@ -1019,8 +1049,8 @@ void InvalidateByAddr(u32 localAddr) u32 addr = block->AddressRanges()[j]; if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRegion = CodeMemRegions[addr >> 28]; - AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 27]; + AddressRange* otherRange = &otherRegion[(addr & 0x7FFFFFF) / 512]; assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); @@ -1028,15 +1058,15 @@ void InvalidateByAddr(u32 localAddr) if (otherRange->Blocks.Length == 0) { - if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) - ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); otherRange->Code = 0; } } } - FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; + FastBlockLookupRegions[block->StartAddrLocal >> 27][(block->StartAddrLocal & 0x7FFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) JitBlocks9.erase(block->StartAddr); else @@ -1055,19 +1085,25 @@ void InvalidateByAddr(u32 localAddr) } } -template -void CheckAndInvalidate(u32 addr) +void CheckAndInvalidateITCM() { - // let's hope this gets all properly inlined - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + for (u32 i = 0; i < ITCMPhysicalSize; i+=16) { - u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) - InvalidateByAddr(localAddr | (region << 28)); + if (CodeIndexITCM[i / 512].Code & (1 << ((i & 0x1FF) / 16))) + { + InvalidateByAddr(i | (ARMJIT_Memory::memregion_ITCM << 27)); + } } } +template +void CheckAndInvalidate(u32 addr) +{ + u32 localAddr = ARMJIT_Memory::LocaliseAddress(region, num, addr); + if (CodeMemRegions[region][(localAddr & 0x7FFFFFF) / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr); +} + JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) { u64* entry = &entries[offset / 2]; @@ -1076,35 +1112,44 @@ JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) return NULL; } +void blockSanityCheck(u32 num, u32 blockAddr, JitBlockEntry entry) +{ + u32 localAddr = LocaliseCodeAddress(num, blockAddr); + assert(JITCompiler->AddEntryOffset((u32)FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]) == entry); +} + bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { + // amazingly ignoring the DTCM is the proper behaviour for code fetches int 
region = num == 0 ? ARMJIT_Memory::ClassifyAddress9(blockAddr) : ARMJIT_Memory::ClassifyAddress7(blockAddr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (CodeMemRegions[region] - && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize)) + u32 memoryOffset; + if (FastBlockLookupRegions[region] + && ARMJIT_Memory::GetMirrorLocation(region, num, blockAddr, memoryOffset, start, size)) { + //printf("setup exec region %d %d %08x %08x %x %x\n", num, region, blockAddr, start, size, memoryOffset); entry = FastBlockLookupRegions[region] + memoryOffset / 2; - // evil, though it should work for everything except DTCM which is not relevant here - start = blockAddr & ~(memorySize - 1); - size = memorySize; return true; } - else - return false; + return false; } template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); -template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); -template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); void ResetBlockCache() { @@ -1133,7 +1178,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } @@ -1145,7 +1190,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2320b7b..04add59 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -16,6 +16,8 @@ void DeInit(); void Reset(); +void CheckAndInvalidateITCM(); + void InvalidateByAddr(u32 pseudoPhysical); template diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index b307d0e..c1b23a7 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -168,7 +168,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); - if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) { ptrdiff_t memopStart = GetCodeOffset(); LoadStorePatch patch; @@ -461,7 +461,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); bool compileFastPath = Config::JIT_FastMemory - && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); if (decrement) { diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 19684c4..c87e1b3 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -214,13 +214,13 @@ u32 LocaliseCodeAddress(u32 num, u32 addr); template void LinkBlock(ARM* cpu, u32 codeOffset); -template T SlowRead9(u32 addr, ARMv5* cpu); -template void SlowWrite9(u32 addr, ARMv5* cpu, T val); -template T SlowRead7(u32 addr); -template void SlowWrite7(u32 addr, T val); +template T SlowRead9(u32 addr, ARMv5* cpu); +template void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); -template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); } diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp index 162827d..0276c65 100644 --- a/src/ARMJIT_Memory.cpp +++ b/src/ARMJIT_Memory.cpp @@ -1,5 +1,7 @@ -#ifdef __SWITCH__ +#if defined(__SWITCH__) #include "switch/compat_switch.h" +#elif defined(_WIN32) +#include #endif #include "ARMJIT_Memory.h" @@ -7,6 +9,7 @@ #include "ARMJIT_Internal.h" #include "ARMJIT_Compiler.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "Wifi.h" @@ -37,66 +40,24 @@ namespace ARMJIT_Memory { -#ifdef __aarch64__ -struct FaultDescription -{ - u64 IntegerRegisters[33]; - u64 FaultAddr; - - u32 GetEmulatedAddr() - { - // now this is podracing - return (u32)IntegerRegisters[0]; - } - u64 RealAddr() - { - return FaultAddr; - } - - u64 GetPC() - { - return IntegerRegisters[32]; - } - - void RestoreAndRepeat(s64 offset); -}; -#else struct FaultDescription { - u64 GetPC() - { - return 0; - } - - u32 GetEmulatedAddr() - { - return 0; - } - u64 RealAddr() - { - return 0; - } - - void RestoreAndRepeat(s64 offset); + u32 EmulatedFaultAddr; + u64 FaultPC; }; -#endif -void FaultHandler(FaultDescription* faultDesc); +bool FaultHandler(FaultDescription* faultDesc, s32& offset); } - -#ifdef __aarch64__ - -extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); - -#endif - -#ifdef __SWITCH__ +#if defined(__SWITCH__) // with LTO the symbols seem to be not properly overriden // if they're somewhere else extern "C" { + +void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + extern char __start__; extern char __rodata_start; @@ -106,57 +67,85 @@ u64 __nx_exception_stack_size = 0x8000; void __libnx_exception_handler(ThreadExceptionDump* ctx) { ARMJIT_Memory::FaultDescription desc; - memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); - desc.IntegerRegisters[29] = ctx->fp.x; - desc.IntegerRegisters[30] = ctx->lr.x; - 
desc.IntegerRegisters[31] = ctx->sp.x; - desc.IntegerRegisters[32] = ctx->pc.x; + desc.EmulatedFaultAddr = ctx->cpu_gprs[0].w; + desc.FaultPC = ctx->pc.x; + + u64 integerRegisters[33]; + memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); + integerRegisters[29] = ctx->fp.x; + integerRegisters[30] = ctx->lr.x; + integerRegisters[31] = ctx->sp.x; + integerRegisters[32] = ctx->pc.x; + + s32 offset; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + integerRegisters[32] += offset; - ARMJIT_Memory::FaultHandler(&desc); + ARM_RestoreContext(integerRegisters); + } if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) { - printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); } else { - printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); } } } + +#elif defined(_WIN32) + +static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) +{ + if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) + return EXCEPTION_CONTINUE_SEARCH; + + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = exceptionInfo->ContextRecord->Rcx; + desc.FaultPC = exceptionInfo->ContextRecord->Rip; + + s32 offset = 0; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + exceptionInfo->ContextRecord->Rip += offset; + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + #endif namespace ARMJIT_Memory { -#ifdef __aarch64__ -void FaultDescription::RestoreAndRepeat(s64 offset) -{ - IntegerRegisters[32] += offset; +void* FastMem9Start, *FastMem7Start; - ARM_RestoreContext(IntegerRegisters); +#ifdef _WIN32 +inline u32 RoundUp(u32 size) +{ + return (size + 0xFFFF) & ~0xFFFF; } #else -void FaultDescription::RestoreAndRepeat(s64 offset) +inline u32 RoundUp(u32 size) { - + return size; } #endif -void* FastMem9Start, *FastMem7Start; - -const u32 MemoryTotalSize = - NDS::MainRAMSize - + NDS::SharedWRAMSize - + NDS::ARM7WRAMSize - + DTCMPhysicalSize; - const u32 MemBlockMainRAMOffset = 0; -const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; -const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; -const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; +const u32 MemBlockSWRAMOffset = RoundUp(NDS::MainRAMMaxSize); +const u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(NDS::SharedWRAMSize); +const u32 MemBlockDTCMOffset = MemBlockARM7WRAMOffset + RoundUp(NDS::ARM7WRAMSize); +const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(DSi::NWRAMSize); +const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(DSi::NWRAMSize); +const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(DSi::NWRAMSize); const u32 OffsetsPerRegion[memregions_Count] = { @@ -173,6 +162,11 @@ const u32 OffsetsPerRegion[memregions_Count] = UINT32_MAX, UINT32_MAX, UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockNWRAM_AOffset, + MemBlockNWRAM_BOffset, + MemBlockNWRAM_COffset }; enum @@ -186,11 +180,13 @@ enum u8 MappingStatus9[1 << (32-12)]; u8 MappingStatus7[1 << (32-12)]; -#ifdef __SWITCH__ +#if defined(__SWITCH__) u8* MemoryBase; u8* MemoryBaseCodeMem; -#else +#elif defined(_WIN32) u8* 
MemoryBase; +HANDLE MemoryFile; +LPVOID ExceptionHandlerHandle; #endif bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) @@ -200,6 +196,9 @@ bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size)); return R_SUCCEEDED(r); +#elif defined(_WIN32) + bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; + return r; #endif } @@ -209,8 +208,24 @@ bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) #ifdef __SWITCH__ Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size); - printf("%x\n", r); return R_SUCCEEDED(r); +#else + return UnmapViewOfFile(dst); +#endif +} + +void SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#if defined(_WIN32) + DWORD winProtection, oldProtection; + if (protection == 0) + winProtection = PAGE_NOACCESS; + else if (protection == 1) + winProtection = PAGE_READONLY; + else + winProtection = PAGE_READWRITE; + VirtualProtect(dst, size, winProtection, &oldProtection); #endif } @@ -230,7 +245,6 @@ struct Mapping if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) { offset += NDS::ARM9->DTCMSize; - printf("%x skip\n", NDS::ARM9->DTCMSize); } else { @@ -245,6 +259,7 @@ struct Mapping offset += 0x1000; } +#ifdef __SWITCH__ if (status == memstate_MappedRW) { u32 segmentSize = offset - segmentOffset; @@ -252,8 +267,12 @@ struct Mapping bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); assert(success); } +#endif } } +#if defined(_WIN32) + UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); +#endif } }; ARMJIT::TinyVector Mappings[memregions_Count]; @@ -268,6 +287,8 @@ void SetCodeProtection(int region, u32 offset, bool protect) Mapping& mapping = Mappings[region][i]; u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (offset < mapping.LocalOffset || offset >= mapping.LocalOffset + mapping.Size) + continue; if (mapping.Num == 0 && region != memregion_DTCM && effectiveAddr >= NDS::ARM9->DTCMBase @@ -276,16 +297,20 @@ void SetCodeProtection(int region, u32 offset, bool protect) u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); - printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; +#if defined(__SWITCH__) bool success; if (protect) success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); else success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); assert(success); +#elif defined(_WIN32) + SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 
1 : 2); +#endif } } @@ -314,8 +339,8 @@ void RemapDTCM(u32 newBase, u32 newSize) printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); - bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); - bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd < start); + bool newOverlap = newSize > 0 && !(newBase >= end || newEnd < start); if (mapping.Num == 0 && (oldOverlap || newOverlap)) { @@ -336,19 +361,50 @@ void RemapDTCM(u32 newBase, u32 newSize) Mappings[memregion_DTCM].Clear(); } +void RemapNWRAM(int num) +{ + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) + { + Mapping& mapping = Mappings[memregion_SharedWRAM][i]; + if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size + || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + { + mapping.Unmap(memregion_SharedWRAM); + Mappings[memregion_SharedWRAM].Remove(i); + } + else + { + i++; + } + } + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + num].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + num][i].Unmap(memregion_NewSharedWRAM_A + num); + } + Mappings[memregion_NewSharedWRAM_A + num].Clear(); +} + void RemapSWRAM() { printf("remapping SWRAM\n"); - for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length; i++) { - Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + Mappings[memregion_SharedWRAM][i].Unmap(memregion_SharedWRAM); } - Mappings[memregion_SWRAM].Clear(); + Mappings[memregion_SharedWRAM].Clear(); for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) { Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); } Mappings[memregion_WRAM7].Clear(); + for (int j = 0; j < 3; j++) + { + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + j].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + j][i].Unmap(memregion_NewSharedWRAM_A + j); + } + Mappings[memregion_NewSharedWRAM_A + j].Clear(); + } } bool MapAtAddress(u32 addr) @@ -359,33 +415,36 @@ bool MapAtAddress(u32 addr) ? ClassifyAddress9(addr) : ClassifyAddress7(addr); - if (!IsMappable(region)) + if (!IsFastmemCompatible(region)) return false; - u32 mappingStart, mappingSize, memoryOffset, memorySize; - bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + return false; + u32 mirrorStart, mirrorSize, memoryOffset; + bool isMapped = GetMirrorLocation(region, num, addr, memoryOffset, mirrorStart, mirrorSize); if (!isMapped) return false; - // this calculation even works with DTCM - // which doesn't have to be aligned to it's own size - u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; - u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; - printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + printf("trying to create mapping %x, %x %d %d\n", mirrorStart, mirrorSize, region, num); bool isExecutable = ARMJIT::CodeMemRegions[region]; - ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; +#if defined(_WIN32) + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); +#endif + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; // this overcomplicated piece of code basically just finds whole pieces of code memory // which can be mapped u32 offset = 0; bool skipDTCM = num == 0 && region != memregion_DTCM; - while (offset < memorySize) + while (offset < mirrorSize) { if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) { + SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); offset += NDS::ARM9->DTCMSize; } else @@ -393,7 +452,7 @@ bool MapAtAddress(u32 addr) u32 sectionOffset = offset; bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) - && offset < memorySize + && offset < mirrorSize && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) { assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); @@ -403,41 +462,49 @@ bool MapAtAddress(u32 addr) u32 sectionSize = offset - sectionOffset; +#if defined(__SWITCH__) if (!hasCode) { printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); assert(succeded); } +#elif defined(_WIN32) + if (hasCode) + { + SetCodeProtectionRange(mirrorStart + offset, sectionSize, num, 1); + } +#endif } } - Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + assert(num == 0 || num == 1); + Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; Mappings[region].Add(mapping); - printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); return true; } -void FaultHandler(FaultDescription* faultDesc) +bool FaultHandler(FaultDescription* faultDesc, s32& offset) { - if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) { bool rewriteToSlowPath = true; - u32 addr = faultDesc->GetEmulatedAddr(); + u32 addr = faultDesc->EmulatedFaultAddr; if ((NDS::CurCPU == 0 ? 
MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) - rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr); - s64 offset = 0; if (rewriteToSlowPath) { - offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC); } - faultDesc->RestoreAndRepeat(offset); + return true; } + return false; } void Init() @@ -459,18 +526,34 @@ void Init() FastMem7Start = virtmemReserve(0x100000000); assert(FastMem7Start); - NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; - NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; - NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; - NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; -#else - MemoryBase = new u8[MemoryTotalSize]; + u8* basePtr = MemoryBaseCodeMem; +#elif defined(_WIN32) + ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler); + + MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL); - NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; - NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; - NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset; - NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; + MemoryBase = (u8*)VirtualAlloc(NULL, MemoryTotalSize, MEM_RESERVE, PAGE_READWRITE); + + FastMem9Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + FastMem7Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + + // only free them after they have all been reserved + // so they can't overlap + VirtualFree(MemoryBase, 0, MEM_RELEASE); + VirtualFree(FastMem9Start, 0, MEM_RELEASE); + VirtualFree(FastMem7Start, 0, MEM_RELEASE); + + MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase); + + u8* basePtr = MemoryBase; #endif + NDS::MainRAM = basePtr + MemBlockMainRAMOffset; + NDS::SharedWRAM = basePtr + MemBlockSWRAMOffset; + NDS::ARM7WRAM = basePtr + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = basePtr + MemBlockDTCMOffset; + DSi::NWRAM_A = basePtr + MemBlockNWRAM_AOffset; + DSi::NWRAM_B = basePtr + MemBlockNWRAM_BOffset; + DSi::NWRAM_C = basePtr + MemBlockNWRAM_COffset; } void DeInit() @@ -482,8 +565,11 @@ void DeInit() svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); free(MemoryBase); -#else - delete[] MemoryBase; +#elif defined(_WIN32) + assert(UnmapViewOfFile(MemoryBase)); + CloseHandle(MemoryFile); + + RemoveVectoredExceptionHandler(ExceptionHandlerHandle); #endif } @@ -505,12 +591,23 @@ void Reset() printf("done resetting jit mem\n"); } -bool IsMappable(int region) +bool IsFastmemCompatible(int region) { +#ifdef _WIN32 + /* + TODO: with some hacks, the smaller shared WRAM regions + could be mapped in some occaisons as well + */ + if (region == memregion_DTCM + || region == memregion_SharedWRAM + || region == memregion_NewSharedWRAM_B + || region == memregion_NewSharedWRAM_C) + return false; +#endif return OffsetsPerRegion[region] != UINT32_MAX; } -bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize) { memoryOffset = 0; switch (region) @@ -518,137 +615,251 @@ bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, case 
memregion_ITCM: if (num == 0) { - mappingStart = 0; - mappingSize = NDS::ARM9->ITCMSize; - memorySize = ITCMPhysicalSize; + mirrorStart = addr & ~(ITCMPhysicalSize - 1); + mirrorSize = ITCMPhysicalSize; return true; } return false; - case memregion_DTCM: + case memregion_MainRAM: + mirrorStart = addr & ~NDS::MainRAMMask; + mirrorSize = NDS::MainRAMMask + 1; + return true; + case memregion_BIOS9: if (num == 0) { - mappingStart = NDS::ARM9->DTCMBase; - mappingSize = NDS::ARM9->DTCMSize; - memorySize = DTCMPhysicalSize; + mirrorStart = addr & ~0xFFF; + mirrorSize = 0x1000; return true; } return false; - case memregion_BIOS9: - if (num == 0) + case memregion_BIOS7: + if (num == 1) { - mappingStart = 0xFFFF0000; - mappingSize = 0x10000; - memorySize = 0x1000; + mirrorStart = 0; + mirrorSize = 0x4000; return true; } return false; - case memregion_MainRAM: - mappingStart = 0x2000000; - mappingSize = 0x1000000; - memorySize = NDS::MainRAMSize; - return true; - case memregion_SWRAM: - mappingStart = 0x3000000; + case memregion_SharedWRAM: if (num == 0 && NDS::SWRAM_ARM9.Mem) { - mappingSize = 0x1000000; + mirrorStart = addr & ~NDS::SWRAM_ARM9.Mask; + mirrorSize = NDS::SWRAM_ARM9.Mask + 1; memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; - memorySize = NDS::SWRAM_ARM9.Mask + 1; return true; } else if (num == 1 && NDS::SWRAM_ARM7.Mem) { - mappingSize = 0x800000; + mirrorStart = addr & ~NDS::SWRAM_ARM7.Mask; + mirrorSize = NDS::SWRAM_ARM7.Mask + 1; memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; - memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + mirrorStart = addr & ~(NDS::ARM7WRAMSize - 1); + mirrorSize = NDS::ARM7WRAMSize; return true; } return false; case memregion_VRAM: if (num == 0) { - // this is a gross simplification - // mostly to make code on vram working - // it doesn't take any of the actual VRAM mappings into account - mappingStart = 0x6000000; - mappingSize = 0x1000000; - memorySize = 0x100000; - return true; + mirrorStart = addr & ~0xFFFFF; + mirrorSize = 0x100000; } return false; - case memregion_BIOS7: + case memregion_VWRAM: if (num == 1) { - mappingStart = 0; - mappingSize = 0x4000; - memorySize = 0x4000; + mirrorStart = addr & ~0x3FFFF; + mirrorSize = 0x40000; return true; } return false; - case memregion_WRAM7: - if (num == 1) + case memregion_NewSharedWRAM_A: { - if (NDS::SWRAM_ARM7.Mem) + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) { - mappingStart = 0x3800000; - mappingSize = 0x800000; + memoryOffset = ptr - DSi::NWRAM_A; + mirrorStart = addr & ~0xFFFF; + mirrorSize = 0x10000; + return true; } - else + return false; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) { - mappingStart = 0x3000000; - mappingSize = 0x1000000; + memoryOffset = ptr - DSi::NWRAM_B; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; } - memorySize = NDS::ARM7WRAMSize; + return false; // zero filled memory + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_C; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; + } + return false; // zero filled memory + } + case memregion_BIOS9DSi: + if (num == 0) + { + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<0) ? 
0x8000 : 0x10000; return true; } return false; - case memregion_VWRAM: + case memregion_BIOS7DSi: if (num == 1) { - mappingStart = 0x6000000; - mappingSize = 0x1000000; - memorySize = 0x20000; + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<8) ? 0x8000 : 0x10000; return true; } return false; default: - // for the JIT we don't are about the rest + assert(false && "For the time being this should only be used for code"); return false; } } +u32 LocaliseAddress(int region, u32 num, u32 addr) +{ + switch (region) + { + case memregion_ITCM: + return (addr & (ITCMPhysicalSize - 1)) | (memregion_ITCM << 27); + case memregion_MainRAM: + return (addr & NDS::MainRAMMask) | (memregion_MainRAM << 27); + case memregion_BIOS9: + return (addr & 0xFFF) | (memregion_BIOS9 << 27); + case memregion_BIOS7: + return (addr & 0x3FFF) | (memregion_BIOS7 << 27); + case memregion_SharedWRAM: + if (num == 0) + return ((addr & NDS::SWRAM_ARM9.Mask) + (NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + else + return ((addr & NDS::SWRAM_ARM7.Mask) + (NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + case memregion_WRAM7: + return (addr & (NDS::ARM7WRAMSize - 1)) | (memregion_WRAM7 << 27); + case memregion_VRAM: + // TODO: take mapping properly into account + return (addr & 0xFFFFF) | (memregion_VRAM << 27); + case memregion_VWRAM: + // same here + return (addr & 0x3FFFF) | (memregion_VWRAM << 27); + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + return (ptr - DSi::NWRAM_A + (addr & 0xFFFF)) | (memregion_NewSharedWRAM_A << 27); + else + return memregion_Other << 27; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + return (ptr - DSi::NWRAM_B + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_B << 27); + else + return memregion_Other << 27; + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + return (ptr - DSi::NWRAM_C + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_C << 27); + else + return memregion_Other << 27; + } + case memregion_BIOS9DSi: + case memregion_BIOS7DSi: + return (addr & 0xFFFF) | (region << 27); + default: + assert(false && "This should only be needed for regions which can contain code"); + return memregion_Other << 27; + } +} + int ClassifyAddress9(u32 addr) { if (addr < NDS::ARM9->ITCMSize) + { return memregion_ITCM; + } else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + { return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else + } + else { + if (NDS::ConsoleType == 1 && addr >= 0xFFFF0000 && !(DSi::SCFG_BIOS & (1<<1))) + { + if ((addr >= 0xFFFF8000) && (DSi::SCFG_BIOS & (1<<0))) + return memregion_Other; + + return memregion_BIOS9DSi; + } + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + { + return memregion_BIOS9; + } + switch (addr & 0xFF000000) { case 0x02000000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[0][0] && addr < DSi::NWRAMEnd[0][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[0][1] && addr < DSi::NWRAMEnd[0][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[0][2] && addr < DSi::NWRAMEnd[0][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM9.Mem) - return memregion_SWRAM; - else - 
return memregion_Other; + return memregion_SharedWRAM; + return memregion_Other; case 0x04000000: return memregion_IO9; case 0x06000000: return memregion_VRAM; + default: + return memregion_Other; } } - return memregion_Other; } int ClassifyAddress7(u32 addr) { - if (addr < 0x00004000) + if (NDS::ConsoleType == 1 && addr < 0x00010000 && !(DSi::SCFG_BIOS & (1<<9))) + { + if (addr >= 0x00008000 && DSi::SCFG_BIOS & (1<<8)) + return memregion_Other; + + return memregion_BIOS7DSi; + } + else if (addr < 0x00004000) + { return memregion_BIOS7; + } else { switch (addr & 0xFF800000) @@ -657,10 +868,19 @@ int ClassifyAddress7(u32 addr) case 0x02800000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[1][0] && addr < DSi::NWRAMEnd[1][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[1][1] && addr < DSi::NWRAMEnd[1][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[1][2] && addr < DSi::NWRAMEnd[1][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM7.Mem) - return memregion_SWRAM; - else - return memregion_WRAM7; + return memregion_SharedWRAM; + return memregion_WRAM7; case 0x03800000: return memregion_WRAM7; case 0x04000000: @@ -740,14 +960,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + else { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; + switch (size | store) + { + case 8: return (void*)DSi::ARM9IORead8; + case 9: return (void*)DSi::ARM9IOWrite8; + case 16: return (void*)DSi::ARM9IORead16; + case 17: return (void*)DSi::ARM9IOWrite16; + case 32: return (void*)DSi::ARM9IORead32; + case 33: return (void*)DSi::ARM9IOWrite32; + } } break; case 0x06000000: @@ -781,14 +1016,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM7IORead8; + case 9: return (void*)DSi::ARM7IOWrite8; + case 16: return (void*)DSi::ARM7IORead16; + case 17: return (void*)DSi::ARM7IOWrite16; + case 32: return (void*)DSi::ARM7IORead32; + case 33: return (void*)DSi::ARM7IOWrite32; + } } break; case 0x04800000: diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h index 1a59d98..123e18e 100644 --- a/src/ARMJIT_Memory.h +++ b/src/ARMJIT_Memory.h @@ -23,7 +23,7 @@ enum memregion_DTCM, memregion_BIOS9, memregion_MainRAM, - memregion_SWRAM, + 
memregion_SharedWRAM, memregion_IO9, memregion_VRAM, memregion_BIOS7, @@ -31,18 +31,28 @@ enum memregion_IO7, memregion_Wifi, memregion_VWRAM, + + // DSi + memregion_BIOS9DSi, + memregion_BIOS7DSi, + memregion_NewSharedWRAM_A, + memregion_NewSharedWRAM_B, + memregion_NewSharedWRAM_C, + memregions_Count }; int ClassifyAddress9(u32 addr); int ClassifyAddress7(u32 addr); -bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize); +u32 LocaliseAddress(int region, u32 num, u32 addr); -bool IsMappable(int region); +bool IsFastmemCompatible(int region); void RemapDTCM(u32 newBase, u32 newSize); void RemapSWRAM(); +void RemapNWRAM(int num); void SetCodeProtection(int region, u32 offset, bool protect); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 34c1c91..d8bdd56 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -40,6 +40,12 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +#ifdef _WIN32 +const BitSet32 CallerSavedPushRegs({R10, R11}); +#else +const BitSet32 CallerSavedPushRegs({R9, R10, R11}); +#endif + void Compiler::PushRegs(bool saveHiRegs) { BitSet32 loadedRegs(RegCache.LoadedRegs); @@ -301,6 +307,107 @@ Compiler::Compiler() RET(); } + for (int consoleType = 0; consoleType < 2; consoleType++) + { + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 16; reg++) + { + if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2 || reg == ABI_PARAM3) + { + PatchedStoreFuncs[consoleType][num][size][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL; + continue; + } + + X64Reg rdMapped = (X64Reg)reg; + PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + MOV(32, R(ABI_PARAM3), R(rdMapped)); + } + else + { + MOV(32, R(ABI_PARAM2), R(rdMapped)); + } + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + MOV(64, R(ABI_PARAM2), R(RCPU)); + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9); break; + case 33: ABI_CallFunction(SlowRead7); break; + case 16: 
ABI_CallFunction(SlowRead9); break; + case 17: ABI_CallFunction(SlowRead7); break; + case 8: ABI_CallFunction(SlowRead9); break; + case 9: ABI_CallFunction(SlowRead7); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9); break; + case 33: ABI_CallFunction(SlowRead7); break; + case 16: ABI_CallFunction(SlowRead9); break; + case 17: ABI_CallFunction(SlowRead7); break; + case 8: ABI_CallFunction(SlowRead9); break; + case 9: ABI_CallFunction(SlowRead7); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (signextend) + MOVSX(32, 8 << size, rdMapped, R(RSCRATCH)); + else + MOVZX(32, 8 << size, rdMapped, R(RSCRATCH)); + RET(); + } + } + } + } + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -500,6 +607,8 @@ void Compiler::Reset() NearCode = NearStart; FarCode = FarStart; + + LoadStorePatches.clear(); } bool Compiler::IsJITFault(u64 addr) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index d1a6c07..0fe0147 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -7,6 +7,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include + namespace ARMJIT { @@ -18,6 +20,13 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; const Gen::X64Reg RSCRATCH4 = Gen::R8; +struct LoadStorePatch +{ + void* PatchFunc; + s16 Offset; + u16 Size; +}; + struct Op2 { Op2() @@ -211,6 +220,11 @@ public: u8* NearStart; u8* FarStart; + void* PatchedStoreFuncs[2][2][3][16]; + void* PatchedLoadFuncs[2][2][3][2][16]; + + std::unordered_map LoadStorePatches; + u8* ResetStart; u32 CodeMemSize; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b780c55..2da113b 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -17,7 +17,30 @@ int squeezePointer(T* ptr) s32 Compiler::RewriteMemAccess(u64 pc) { - return 0; + auto it = LoadStorePatches.find((u8*)pc); + if (it != LoadStorePatches.end()) + { + LoadStorePatch patch = it->second; + LoadStorePatches.erase(it); + + u8* curCodePtr = GetWritableCodePtr(); + u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset; + SetCodePtr(rewritePtr); + + CALL(patch.PatchFunc); + u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr); + if (remainingSize > 0) + NOP(remainingSize); + + //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size); + + SetCodePtr(curCodePtr); + + return patch.Offset; + } + + printf("this is a JIT bug %x\n", pc); + abort(); } /* @@ -91,369 +114,213 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag return; } + if (flags & memop_Store) { - if (flags & memop_Store) - { - Comp_AddCycles_CD(); - } - else - { - Comp_AddCycles_CDI(); - } + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - bool addrIsStatic = Config::JIT_LiteralOptimisations - && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); - u32 staticAddress; - if (addrIsStatic) - staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); - OpArg rdMapped = MapReg(rd); + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + OpArg rdMapped = MapReg(rd); - if (true) - { - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); - X64Reg finalAddr = RSCRATCH3; - if (flags & memop_Post) - { - MOV(32, R(RSCRATCH3), rnMapped); + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) + { + MOV(32, R(RSCRATCH3), rnMapped); - finalAddr = rnMapped.GetSimpleReg(); - } + finalAddr = rnMapped.GetSimpleReg(); + } - if (op2.IsImm) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } else - { - OpArg rm = MapReg(op2.Reg.Reg); + MOV_sum(32, finalAddr, rnMapped, offset); + } + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) - { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); - } - else - { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); - if (flags & memop_SubtractOffset) - { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); - } - else - MOV_sum(32, finalAddr, rnMapped, offset); - } - } + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + u8* memopStart = GetWritableCodePtr(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] + : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; - /*int expectedTarget = Num == 0 - ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) - : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - if (CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; + assert(patch.PatchFunc != NULL); - bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); + MOV(64, R(RSCRATCH), ImmPtr(Num == 0 ? 
ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); - switch (expectedTarget) + X64Reg maskedAddr = RSCRATCH3; + if (size > 8) { - case memregion_MainRAM: - case memregion_DTCM: - case memregion_WRAM7: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_IO9: - case memregion_IO7: - case memregion_VWRAM: - compileFastPath = true; - break; - case memregion_Wifi: - compileFastPath = size >= 16; - break; - case memregion_VRAM: - compileFastPath = !(flags & memop_Store) || size >= 16; - case memregion_BIOS9: - compileFastPath = !(flags & memop_Store); - break; - default: break; + maskedAddr = RSCRATCH2; + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + AND(32, R(RSCRATCH2), Imm8(addressMask)); } - if (addrIsStatic && !compileFastPath) + u8* memopLoadStoreLocation = GetWritableCodePtr(); + if (flags & memop_Store) { - compileFastPath = false; - compileSlowPath = true; + MOV(size, MRegSum(RSCRATCH, maskedAddr), rdMapped); } - - if (addrIsStatic && compileSlowPath) - MOV(32, R(RSCRATCH3), Imm32(staticAddress)); -*/ - /*if (compileFastPath) + else { - FixupBranch slowPath; - if (compileSlowPath) - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SHR(32, R(RSCRATCH), Imm8(9)); - if (flags & memop_Store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - - slowPath = J_CC(CC_NE, true); - } + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); - if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 - || expectedTarget == memregion_BIOS9) + if (size == 32) { - u8* data; - u32 mask; - if (expectedTarget == memregion_MainRAM) - { - data = NDS::MainRAM; - mask = MAIN_RAM_SIZE - 1; - } - else if (expectedTarget == memregion_BIOS9) - { - data = NDS::ARM9BIOS; - mask = 0xFFF; - } - else - { - data = NDS::ARM7WRAM; - mask = 0xFFFF; - } - OpArg memLoc; - if (addrIsStatic) - { - memLoc = M(data + ((staticAddress & mask & addressMask))); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm32(mask & addressMask)); - memLoc = MDisp(RSCRATCH, squeezePointer(data)); - } - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_DTCM) - { - if (addrIsStatic) - MOV(32, R(RSCRATCH), Imm32(staticAddress)); - else - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) - { - MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - if (addrIsStatic) - { - MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm8(addressMask)); - } - AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else - { - u32 maskedDataRegion; - - if (addrIsStatic) - { - maskedDataRegion = staticAddress; - MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); - } - else - { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - AND(32, R(ABI_PARAM1), Imm8(addressMask)); - - maskedDataRegion = CurInstr.DataRegion; - if (Num == 0) - maskedDataRegion &= ~0xFFFFFF; - else - maskedDataRegion &= ~0x7FFFFF; - } + } - void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); + patch.Offset = memopStart - memopLoadStoreLocation; + patch.Size = GetWritableCodePtr() - memopStart; - if (flags & memop_Store) - { - PushRegs(false); + assert(patch.Size >= 5); - MOV(32, R(ABI_PARAM2), rdMapped); + LoadStorePatches[memopLoadStoreLocation] = patch; + } + else + { + PushRegs(false); - ABI_CallFunction((void(*)())func); + if (Num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); - PopRegs(false); - } - else + switch (size | NDS::ConsoleType) { - if (!addrIsStatic) - MOV(32, rdMapped, R(RSCRATCH3)); - - PushRegs(false); - - ABI_CallFunction((void(*)())func); - - PopRegs(false); - - if (!addrIsStatic) - MOV(32, R(RSCRATCH3), rdMapped); - - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + case 33: CALL((void*)&SlowWrite9); break; + case 17: CALL((void*)&SlowWrite9); break; + case 9: CALL((void*)&SlowWrite9); break; } } - - if ((size == 32 && !(flags & memop_Store))) + else { - if (addrIsStatic) - { - if (staticAddress & 0x3) - ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); - } - else + switch (size | NDS::ConsoleType) { - AND(32, R(RSCRATCH3), Imm8(0x3)); - SHL(32, R(RSCRATCH3), Imm8(3)); - ROR_(32, rdMapped, R(RSCRATCH3)); + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + case 33: CALL((void*)&SlowRead9); break; + case 17: CALL((void*)&SlowRead9); break; + case 9: CALL((void*)&SlowRead9); break; } } - - if (compileSlowPath) - { - SwitchToFarCode(); - SetJumpTarget(slowPath); - } } -*/ - if (true) + else { - PushRegs(false); - - if (Num == 0) + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) { - MOV(64, R(ABI_PARAM2), R(RCPU)); - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) - { - MOV(32, R(ABI_PARAM3), rdMapped); + MOV(32, R(ABI_PARAM2), rdMapped); - switch (size) - { - case 32: CALL((void*)&SlowWrite9); break; - case 16: CALL((void*)&SlowWrite9); break; - case 
8: CALL((void*)&SlowWrite9); break; - } - } - else + switch (size | NDS::ConsoleType) { - switch (size) - { - case 32: CALL((void*)&SlowRead9); break; - case 16: CALL((void*)&SlowRead9); break; - case 8: CALL((void*)&SlowRead9); break; - } + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + case 33: CALL((void*)&SlowWrite7); break; + case 17: CALL((void*)&SlowWrite7); break; + case 9: CALL((void*)&SlowWrite7); break; } } else { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) + switch (size | NDS::ConsoleType) { - MOV(32, R(ABI_PARAM2), rdMapped); - - switch (size) - { - case 32: CALL((void*)&SlowWrite7); break; - case 16: CALL((void*)&SlowWrite7); break; - case 8: CALL((void*)&SlowWrite7); break; - } - } - else - { - switch (size) - { - case 32: CALL((void*)&SlowRead7); break; - case 16: CALL((void*)&SlowRead7); break; - case 8: CALL((void*)&SlowRead7); break; - } + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + case 33: CALL((void*)&SlowRead7); break; + case 17: CALL((void*)&SlowRead7); break; + case 9: CALL((void*)&SlowRead7); break; } } + } - PopRegs(false); + PopRegs(false); - if (!(flags & memop_Store)) - { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - } - } -/* - if (compileFastPath && compileSlowPath) + if (!(flags & memop_Store)) { - FixupBranch ret = J(true); - SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } - if (!(flags & memop_Store) && rd == 15) + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); { - if (size < 32) - printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + if (Num == 1) { - if (Num == 1) - AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rdMapped.GetSimpleReg()); + if (Thumb) + OR(32, rdMapped, Imm8(0x1)); + else + AND(32, rdMapped, Imm8(0xFE)); } + Comp_JumpTo(rdMapped.GetSimpleReg()); } } } @@ -470,7 +337,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int flags = 0; if (store) flags |= memop_Store; - if (decrement) + if (decrement && preinc) flags |= memop_SubtractOffset; Op2 offset = preinc ? Op2(4) : Op2(0); @@ -481,96 +348,52 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - // we need to make sure that the stack stays aligned to 16 bytes -#ifdef _WIN32 - // include shadow - u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; -#else - u32 stackAlloc = ((regsCount + 1) & ~1) * 8; -#endif - u32 allocOffset = stackAlloc - regsCount * 8; -/* int expectedTarget = Num == 0 - ? ClassifyAddress9(CurInstr.DataRegion) - : ClassifyAddress7(CurInstr.DataRegion); - if (usermode || CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; - - bool compileFastPath = false; + ? 
ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - switch (expectedTarget) - { - case memregion_DTCM: - case memregion_MainRAM: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_WRAM7: - compileFastPath = true; - break; - default: - break; - } -*/ if (!store) Comp_AddCycles_CDI(); else Comp_AddCycles_CD(); + bool compileFastPath = Config::JIT_FastMemory + && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = (((regsCount + 4 + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#else + u32 stackAlloc = (((regsCount + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; + if (decrement) - { - MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4 + (preinc ? 0 : 4))); else - MOV(32, R(RSCRATCH4), MapReg(rn)); -/* + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(preinc ? 4 : 0)); + if (compileFastPath) { - assert(!usermode); + AND(32, R(RSCRATCH4), Imm8(~3)); - MOV(32, R(RSCRATCH), R(RSCRATCH4)); - SHR(32, R(RSCRATCH), Imm8(9)); + u8* fastPathStart = GetWritableCodePtr(); + u8* firstLoadStoreAddr; - if (store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - FixupBranch slowPath = J_CC(CC_NE, true); + bool firstLoadStore = true; + + MOV(64, R(RSCRATCH2), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + ADD(64, R(RSCRATCH2), R(RSCRATCH4)); + MOV(32, R(RSCRATCH3), R(RSCRATCH4)); - if (expectedTarget == memregion_DTCM) - { - SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); - LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); - } - else if (expectedTarget == memregion_MainRAM) - { - AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); - } - else if (expectedTarget == memregion_WRAM7) - { - AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); - } - else // SWRAM - { - AND(32, R(RSCRATCH4), Imm8(~3)); - AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - } u32 offset = 0; for (int reg : regs) { - if (preinc) - offset += 4; - OpArg mem = MDisp(RSCRATCH4, offset); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + + OpArg mem = MDisp(RSCRATCH2, offset); if (store) { if (RegCache.LoadedRegs & (1 << reg)) @@ -580,6 +403,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc else { LoadReg(reg, RSCRATCH); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); MOV(32, mem, R(RSCRATCH)); } } @@ -595,13 +420,19 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SaveReg(reg, RSCRATCH); } } - if (!preinc) - offset += 4; + offset += 4; + + firstLoadStore = false; } + LoadStorePatch patch; + patch.Size = GetWritableCodePtr() - fastPathStart; + patch.Offset = fastPathStart - firstLoadStoreAddr; SwitchToFarCode(); - SetJumpTarget(slowPath); - }*/ + patch.PatchFunc = GetWritableCodePtr(); + + LoadStorePatches[firstLoadStoreAddr] = patch; + } if (!store) { @@ -618,12 +449,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9<false>); break; - case 1: CALL((void*)&SlowBlockTransfer9<true>); break; - case 2: CALL((void*)&SlowBlockTransfer7<false>); break; - case 3: CALL((void*)&SlowBlockTransfer7<true>); break; + case 0: CALL((void*)&SlowBlockTransfer9<false, 0>); break; + case 1: CALL((void*)&SlowBlockTransfer9<false, 1>); break; + case 2: CALL((void*)&SlowBlockTransfer7<false, 0>); break; + case 3: CALL((void*)&SlowBlockTransfer7<false, 1>); break; } PopRegs(false); @@ -715,25 +546,24 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9<false>); break; - case 1: CALL((void*)&SlowBlockTransfer9<true>); break; - case 2: CALL((void*)&SlowBlockTransfer7<false>); break; - case 3: CALL((void*)&SlowBlockTransfer7<true>); break; + case 0: CALL((void*)&SlowBlockTransfer9<true, 0>); break; + case 1: CALL((void*)&SlowBlockTransfer9<true, 1>); break; + case 2: CALL((void*)&SlowBlockTransfer7<true, 0>); break; + case 3: CALL((void*)&SlowBlockTransfer7<true, 1>); break; } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); PopRegs(false); } -/* + if (compileFastPath) { - FixupBranch ret = J(true); + RET(); SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + } if (!store && regs[15]) { diff --git a/src/CP15.cpp b/src/CP15.cpp index 3d64259..992c83f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -608,6 +608,27 @@ void ARMv5::CP15Write(u32 id, u32 val) ITCMSetting = val; UpdateITCMSetting(); return; + + case 0xF00: + //printf("cache debug index register %08X\n", val); + return; + + case 0xF10: + //printf("cache debug instruction tag %08X\n", val); + return; + + case 0xF20: + //printf("cache debug data tag %08X\n", val); + return; + + case 0xF30: + //printf("cache debug instruction cache %08X\n", val); + return; + + case 0xF40: + //printf("cache debug data cache %08X\n", val); + return; + } if ((id&0xF00)!=0x700) diff --git a/src/Config.cpp b/src/Config.cpp index edf84f2..de1c70d 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,14 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; -int JIT_LiteralOptimisations = true; -#endif - -#ifdef JIT_ENABLED -int JIT_Enable = false; -int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = true; +int JIT_BranchOptimisations = 2; int JIT_LiteralOptimisations = true; int JIT_FastMemory = true; #endif @@ -66,16 +59,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, - {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, -#endif - -#ifdef JIT_ENABLED - {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, - {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, - {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, + {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 7b19a4b..5916b4a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,14 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; -extern int JIT_LiteralOptimisations; -#endif - -#ifdef JIT_ENABLED -extern int JIT_Enable; -extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; +extern int JIT_BranchOptimisations; extern int JIT_LiteralOptimisations; extern int JIT_FastMemory; #endif diff --git a/src/DSi.cpp b/src/DSi.cpp index 216f724..97a63cd 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -26,6 +26,11 @@ #include "NDSCart.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi_NDMA.h" #include "DSi_I2C.h" #include "DSi_SD.h" @@ -34,15 +39,6 @@ #include "tiny-AES-c/aes.hpp" -namespace NDS -{ - -extern ARMv5* ARM9; -extern ARMv4* ARM7; - -} - - namespace DSi { @@ -59,9 +55,9 @@ u8 ARM7iBIOS[0x10000]; u32 MBK[2][9]; -u8 NWRAM_A[0x40000]; -u8 NWRAM_B[0x40000]; -u8 NWRAM_C[0x40000]; +u8* NWRAM_A; +u8* NWRAM_B; +u8* NWRAM_C; u8* NWRAMMap_A[2][4]; u8* NWRAMMap_B[3][8]; @@ -86,6 +82,12 @@ u8 ARM7Init[0x3C00]; bool Init() { +#ifndef JIT_ENABLED + NWRAM_A = new u8[NWRAMSize]; + NWRAM_B = new u8[NWRAMSize]; + NWRAM_C = new u8[NWRAMSize]; +#endif + if 
(!DSi_I2C::Init()) return false; if (!DSi_AES::Init()) return false; @@ -106,6 +108,12 @@ bool Init() void DeInit() { +#ifndef JIT_ENABLED + delete[] NWRAM_A; + delete[] NWRAM_B; + delete[] NWRAM_C; +#endif + DSi_I2C::DeInit(); DSi_AES::DeInit(); @@ -176,7 +184,12 @@ void SoftReset() NDS::ARM9->Reset(); NDS::ARM7->Reset(); + NDS::ARM9->CP15Reset(); + memcpy(NDS::ARM9->ITCM, ITCMInit, 0x8000); +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidateITCM(); +#endif DSi_AES::Reset(); @@ -274,9 +287,9 @@ bool LoadNAND() { printf("Loading DSi NAND\n"); - memset(NWRAM_A, 0, 0x40000); - memset(NWRAM_B, 0, 0x40000); - memset(NWRAM_C, 0, 0x40000); + memset(NWRAM_A, 0, NWRAMSize); + memset(NWRAM_B, 0, NWRAMSize); + memset(NWRAM_C, 0, NWRAMSize); memset(MBK, 0, sizeof(MBK)); memset(NWRAMMap_A, 0, sizeof(NWRAMMap_A)); @@ -527,6 +540,8 @@ void MapNWRAM_A(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(0); + int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -558,6 +573,8 @@ void MapNWRAM_B(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(1); + int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -593,6 +610,8 @@ void MapNWRAM_C(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(2); + int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -625,6 +644,8 @@ void MapNWRAMRange(u32 cpu, u32 num, u32 val) u32 oldval = MBK[cpu][5+num]; if (oldval == val) return; + ARMJIT_Memory::RemapNWRAM(num); + MBK[cpu][5+num] = val; // TODO: what happens when the ranges are 'out of range'???? @@ -826,19 +847,31 @@ void ARM9Write8(u32 addr, u8 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write8(addr, val); @@ -859,19 +892,31 @@ void ARM9Write16(u32 addr, u16 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; 
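// A note on the pattern above (an editorial sketch, not code from this patch):
// each NWRAM write pairs the raw store with ARMJIT::CheckAndInvalidate<cpu,
// region>(addr), the JIT's guard against self-modifying code. Conceptually it
// amounts to a cheap "does this page hold compiled code?" test before paying
// for a real invalidation; LocaliseAddress, PageContainsCode and
// InvalidateBlocksAt are hypothetical names used only for illustration:
//
// template <u32 Num, int Region>
// void CheckAndInvalidate(u32 addr)
// {
//     u32 local = LocaliseAddress(Region, Num, addr); // fold mirrors into one key
//     if (PageContainsCode(local))                    // e.g. one bit per 512-byte page
//         InvalidateBlocksAt(local);                  // dropped blocks recompile on next run
// }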
} return NDS::ARM9Write16(addr, val); @@ -892,19 +937,31 @@ void ARM9Write32(u32 addr, u32 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write32(addr, val); @@ -1085,19 +1142,37 @@ void ARM7Write8(u32 addr, u8 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); +#endif + } return; } return NDS::ARM7Write8(addr, val); @@ -1118,19 +1193,31 @@ void ARM7Write16(u32 addr, u16 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write16(addr, val); @@ -1151,19 +1238,31 @@ void ARM7Write32(u32 addr, u32 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < 
NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write32(addr, val); @@ -1521,7 +1620,7 @@ u8 ARM7IORead8(u32 addr) case 0x04004501: return DSi_I2C::Cnt; case 0x04004D00: if (SCFG_BIOS & (1<<10)) return 0; return ConsoleID & 0xFF; - case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; + case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; case 0x04004D02: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 16) & 0xFF; case 0x04004D03: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 24) & 0xFF; case 0x04004D04: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 32) & 0xFF; diff --git a/src/DSi.h b/src/DSi.h index 8cc8fd5..40f22bb 100644 --- a/src/DSi.h +++ b/src/DSi.h @@ -25,6 +25,8 @@ namespace DSi { +extern u16 SCFG_BIOS; + extern u8 ARM9iBIOS[0x10000]; extern u8 ARM7iBIOS[0x10000]; @@ -34,6 +36,19 @@ extern u64 ConsoleID; extern DSi_SDHost* SDMMC; extern DSi_SDHost* SDIO; +const u32 NWRAMSize = 0x40000; + +extern u8* NWRAM_A; +extern u8* NWRAM_B; +extern u8* NWRAM_C; + +extern u8* NWRAMMap_A[2][4]; +extern u8* NWRAMMap_B[3][8]; +extern u8* NWRAMMap_C[3][8]; + +extern u32 NWRAMStart[2][3]; +extern u32 NWRAMEnd[2][3]; +extern u32 NWRAMMask[2][3]; bool Init(); void DeInit(); diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp index 9984f5e..e22c708 100644 --- a/src/DSi_I2C.cpp +++ b/src/DSi_I2C.cpp @@ -21,6 +21,7 @@ #include "DSi.h" #include "DSi_I2C.h" #include "DSi_Camera.h" +#include "ARM.h" namespace DSi_BPTWL { @@ -108,7 +109,8 @@ void Write(u8 val, bool last) printf("BPTWL: soft-reset\n"); val = 0; // checkme // TODO: soft-reset might need to be scheduled later! 
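The hunk below replaces the direct DSi::SoftReset() call with NDS::ARM7->Halt(4). The reason (per the TODO left in its place) is that a full reset tears down emulator state, including the JIT block cache, while a compiled block may still be executing. A minimal sketch of how a frame loop could act on the halt flag once no JIT code is on the call stack; the loop structure and the Halted == 4 convention are illustrative assumptions inferred from the patch, not confirmed by it:

    // hypothetical frontend frame loop (illustrative, not from this patch)
    while (emuRunning)
    {
        NDS::RunFrame();
        if (NDS::ARM7->Halted == 4)  // soft-reset requested by DSi_BPTWL::Write()
            DSi::SoftReset();        // safe here: no compiled block is executing
    }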
- DSi::SoftReset(); + // TODO: this has been moved for the JIT to work, nothing is confirmed here + NDS::ARM7->Halt(4); CurPos = -1; return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 3d65482..6981a42 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -32,8 +32,11 @@ #include "Wifi.h" #include "AREngine.h" #include "Platform.h" + +#ifdef JIT_ENABLED #include "ARMJIT.h" #include "ARMJIT_Memory.h" +#endif #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -173,7 +176,7 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); #else - MainRAM = new u8[MainRAMSize]; + MainRAM = new u8[0x1000000]; ARM7WRAM = new u8[ARM7WRAMSize]; SharedWRAM = new u8[SharedWRAMSize]; #endif @@ -1837,7 +1840,7 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -1902,7 +1905,7 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -2031,16 +2034,13 @@ void ARM9Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2090,16 +2090,13 @@ void ARM9Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2168,16 +2165,13 @@ void ARM9Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return ; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2235,7 +2229,7 @@ void ARM9Write32(u32 addr, u32 val) return; } - printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); + //printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); } bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) @@ -2475,16 +2469,13 @@ void ARM7Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2552,16 +2543,13 @@ void 
ARM7Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2639,16 +2627,13 @@ void ARM7Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; diff --git a/src/NDS.h b/src/NDS.h index 4b4f9a1..e0a5045 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -165,6 +165,8 @@ extern u16 ARM7BIOSProt; extern u8* MainRAM; extern u32 MainRAMMask; +const u32 MainRAMMaxSize = 0x1000000; + const u32 SharedWRAMSize = 0x8000; extern u8* SharedWRAM; diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 09faf4e..9ee7b9a 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -32,6 +32,7 @@ EmuSettingsDialog* EmuSettingsDialog::currentDlg = nullptr; extern char* EmuDirectory; +extern bool RunningSomething; EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::EmuSettingsDialog) @@ -53,6 +54,22 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->cbxConsoleType->setCurrentIndex(Config::ConsoleType); ui->chkDirectBoot->setChecked(Config::DirectBoot != 0); + +#ifdef JIT_ENABLED + ui->chkEnableJIT->setChecked(Config::JIT_Enable != 0); + ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); + ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); + ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); +#else + ui->chkEnableJIT->setDisabled(true); + ui->chkJITBranchOptimisations->setDisabled(true); + ui->chkJITLiteralOptimisations->setDisabled(true); + ui->chkJITFastMemory->setDisabled(true); + ui->spnJITMaximumBlockSize->setDisabled(true); +#endif + + on_chkEnableJIT_toggled(); } EmuSettingsDialog::~EmuSettingsDialog() @@ -102,29 +119,78 @@ void EmuSettingsDialog::verifyFirmware() } } -void EmuSettingsDialog::on_EmuSettingsDialog_accepted() +void EmuSettingsDialog::done(int r) { - verifyFirmware(); - - strncpy(Config::BIOS9Path, ui->txtBIOS9Path->text().toStdString().c_str(), 1023); Config::BIOS9Path[1023] = '\0'; - strncpy(Config::BIOS7Path, ui->txtBIOS7Path->text().toStdString().c_str(), 1023); Config::BIOS7Path[1023] = '\0'; - strncpy(Config::FirmwarePath, ui->txtFirmwarePath->text().toStdString().c_str(), 1023); Config::FirmwarePath[1023] = '\0'; - - strncpy(Config::DSiBIOS9Path, ui->txtDSiBIOS9Path->text().toStdString().c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; - strncpy(Config::DSiBIOS7Path, ui->txtDSiBIOS7Path->text().toStdString().c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; - strncpy(Config::DSiFirmwarePath, 
ui->txtDSiFirmwarePath->text().toStdString().c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; - strncpy(Config::DSiNANDPath, ui->txtDSiNANDPath->text().toStdString().c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; - - Config::ConsoleType = ui->cbxConsoleType->currentIndex(); - Config::DirectBoot = ui->chkDirectBoot->isChecked() ? 1:0; - - Config::Save(); + if (r == QDialog::Accepted) + { + verifyFirmware(); + + int consoleType = ui->cbxConsoleType->currentIndex(); + int directBoot = ui->chkDirectBoot->isChecked() ? 1:0; + + int jitEnable = ui->chkEnableJIT->isChecked() ? 1:0; + int jitMaxBlockSize = ui->spnJITMaximumBlockSize->value(); + int jitBranchOptimisations = ui->chkJITBranchOptimisations->isChecked() ? 1:0; + int jitLiteralOptimisations = ui->chkJITLiteralOptimisations->isChecked() ? 1:0; + int jitFastMemory = ui->chkJITFastMemory->isChecked() ? 1:0; + + std::string bios9Path = ui->txtBIOS9Path->text().toStdString(); + std::string bios7Path = ui->txtBIOS7Path->text().toStdString(); + std::string firmwarePath = ui->txtFirmwarePath->text().toStdString(); + std::string dsiBios9Path = ui->txtDSiBIOS9Path->text().toStdString(); + std::string dsiBios7Path = ui->txtDSiBIOS7Path->text().toStdString(); + std::string dsiFirmwarePath = ui->txtDSiFirmwarePath->text().toStdString(); + std::string dsiNANDPath = ui->txtDSiNANDPath->text().toStdString(); + + if (consoleType != Config::ConsoleType + || directBoot != Config::DirectBoot +#ifdef JIT_ENABLED + || jitEnable != Config::JIT_Enable + || jitMaxBlockSize != Config::JIT_MaxBlockSize + || jitBranchOptimisations != Config::JIT_BranchOptimisations + || jitLiteralOptimisations != Config::JIT_LiteralOptimisations + || jitFastMemory != Config::JIT_FastMemory +#endif + || strcmp(Config::BIOS9Path, bios9Path.c_str()) != 0 + || strcmp(Config::BIOS7Path, bios7Path.c_str()) != 0 + || strcmp(Config::FirmwarePath, firmwarePath.c_str()) != 0 + || strcmp(Config::DSiBIOS9Path, dsiBios9Path.c_str()) != 0 + || strcmp(Config::DSiBIOS7Path, dsiBios7Path.c_str()) != 0 + || strcmp(Config::DSiFirmwarePath, dsiFirmwarePath.c_str()) != 0 + || strcmp(Config::DSiNANDPath, dsiNANDPath.c_str()) != 0) + { + if (RunningSomething + && QMessageBox::warning(this, "Reset necessary to apply changes", + "The emulation will be reset for the changes to take place", + QMessageBox::Yes, QMessageBox::Cancel) != QMessageBox::Yes) + return; + + strncpy(Config::BIOS9Path, bios9Path.c_str(), 1023); Config::BIOS9Path[1023] = '\0'; + strncpy(Config::BIOS7Path, bios7Path.c_str(), 1023); Config::BIOS7Path[1023] = '\0'; + strncpy(Config::FirmwarePath, firmwarePath.c_str(), 1023); Config::FirmwarePath[1023] = '\0'; + + strncpy(Config::DSiBIOS9Path, dsiBios9Path.c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; + strncpy(Config::DSiBIOS7Path, dsiBios7Path.c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; + strncpy(Config::DSiFirmwarePath, dsiFirmwarePath.c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; + strncpy(Config::DSiNANDPath, dsiNANDPath.c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; + + #ifdef JIT_ENABLED + Config::JIT_Enable = jitEnable; + Config::JIT_MaxBlockSize = jitMaxBlockSize; + Config::JIT_BranchOptimisations = jitBranchOptimisations; + Config::JIT_LiteralOptimisations = jitLiteralOptimisations; + Config::JIT_FastMemory = jitFastMemory; + #endif + + Config::ConsoleType = consoleType; + Config::DirectBoot = directBoot; + + Config::Save(); + } + } - closeDlg(); -} + QDialog::done(r); -void EmuSettingsDialog::on_EmuSettingsDialog_rejected() -{ closeDlg(); } @@ 
-211,3 +277,12 @@ void EmuSettingsDialog::on_btnDSiNANDBrowse_clicked() ui->txtDSiNANDPath->setText(file); } + +void EmuSettingsDialog::on_chkEnableJIT_toggled() +{ + bool disabled = !ui->chkEnableJIT->isChecked(); + ui->chkJITBranchOptimisations->setDisabled(disabled); + ui->chkJITLiteralOptimisations->setDisabled(disabled); + ui->chkJITFastMemory->setDisabled(disabled); + ui->spnJITMaximumBlockSize->setDisabled(disabled); +} \ No newline at end of file diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.h b/src/frontend/qt_sdl/EmuSettingsDialog.h index f604ba5..268036c 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.h +++ b/src/frontend/qt_sdl/EmuSettingsDialog.h @@ -51,8 +51,7 @@ public: } private slots: - void on_EmuSettingsDialog_accepted(); - void on_EmuSettingsDialog_rejected(); + void done(int r); void on_btnBIOS9Browse_clicked(); void on_btnBIOS7Browse_clicked(); @@ -63,6 +62,8 @@ private slots: void on_btnDSiFirmwareBrowse_clicked(); void on_btnDSiNANDBrowse_clicked(); + void on_chkEnableJIT_toggled(); + private: void verifyFirmware(); diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.ui b/src/frontend/qt_sdl/EmuSettingsDialog.ui index 4894fa5..11d48cc 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.ui +++ b/src/frontend/qt_sdl/EmuSettingsDialog.ui @@ -6,8 +6,8 @@ 0 0 - 490 - 392 + 514 + 359 @@ -24,243 +24,336 @@ QLayout::SetFixedSize - - - DS mode + + + 0 - - - - - - 0 - 0 - - - - - 290 - 0 - - - - - - - <html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html> - - - - - - - DS firmware: - - - - - - - DS ARM7 BIOS: - - - - - - - DS ARM9 BIOS: - - - - - - - - 0 - 0 - - - - Browse... - - - true - - - - - - - <html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html> - - - - - - - Browse... - - - - - - - <html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 KB: DS-mode firmware from a DSi or 3DS. Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html> - - - - - - - Browse... - - - - - - - - - - DSi mode - - - - - - Browse... - - - - - - - DSi ARM9 BIOS: - - - - - - - Browse... - - - - - - - <html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> - - - - - - - <html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html> - - - - - - - DSi ARM7 BIOS: - - - - - - - DSi firmware: - - - - - - - Browse... - - - - - - - - 0 - 0 - - - - <html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> - - - - - - - DSi NAND: - - - - - - - <html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html> - - - - - - - Browse... 
- - - - - - - - - - General - - - - - - - 0 - 0 - - - - Console type: - - - - - - - - 0 - 0 - - - - <html><head/><body><p>The type of console to emulate</p></body></html> - - - - - - - <html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html> - - - Boot game directly - - - - + + + General + + + + + + + 0 + 0 + + + + <html><head/><body><p>The type of console to emulate</p></body></html> + + + + + + + <html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html> + + + Boot game directly + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + + 0 + 0 + + + + Console type: + + + + + + + + BIOS Files + + + + + + DS mode + + + + + + DS firmware: + + + + + + + <html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 KB: DS-mode firmware from a DSi or 3DS. Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html> + + + + + + + <html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html> + + + + + + + + 0 + 0 + + + + Browse... + + + true + + + + + + + Browse... + + + + + + + DS ARM7 BIOS: + + + + + + + DS ARM9 BIOS: + + + + + + + Browse... + + + + + + + + 0 + 0 + + + + + 290 + 0 + + + + + + + <html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html> + + + + + + + + + + DSi mode + + + + + + Browse... + + + + + + + DSi ARM9 BIOS: + + + + + + + Browse... + + + + + + + <html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> + + + + + + + <html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html> + + + + + + + DSi ARM7 BIOS: + + + + + + + DSi firmware: + + + + + + + Browse... + + + + + + + + 0 + 0 + + + + <html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> + + + + + + + DSi NAND: + + + + + + + <html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html> + + + + + + + Browse... 
+ + + + + + + + + + + CPU Emulation + + + + + + Enable JIT recompiler + + + + + + + Maximum JIT block size: + + + + + + + 1 + + + 32 + + + 32 + + + + + + + Branch Optimisations + + + + + + + Literal Optimisations + + + + + + + Fast Memory + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + @@ -275,6 +368,27 @@ + + tabWidget + cbxConsoleType + chkDirectBoot + txtBIOS9Path + txtBIOS7Path + txtFirmwarePath + txtDSiBIOS9Path + txtDSiBIOS7Path + txtDSiFirmwarePath + txtDSiNANDPath + btnBIOS9Browse + btnBIOS7Browse + btnFirmwareBrowse + btnDSiBIOS9Browse + btnDSiBIOS7Browse + btnDSiFirmwareBrowse + btnDSiNANDBrowse + chkEnableJIT + spnJITMaximumBlockSize + @@ -284,8 +398,8 @@ accept() - 248 - 254 + 257 + 349 157 @@ -300,8 +414,8 @@ reject() - 316 - 260 + 325 + 349 286 diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp index fa542ad..4557d0e 100644 --- a/src/frontend/qt_sdl/main.cpp +++ b/src/frontend/qt_sdl/main.cpp @@ -1641,7 +1641,14 @@ void MainWindow::onStop() void MainWindow::onOpenEmuSettings() { - EmuSettingsDialog::openDlg(this); + EmuSettingsDialog* dlg = EmuSettingsDialog::openDlg(this); + connect(dlg, &EmuSettingsDialog::finished, this, &MainWindow::onEmuSettingsDialogFinished); +} + +void MainWindow::onEmuSettingsDialogFinished(int res) +{ + if (RunningSomething) + onReset(); } void MainWindow::onOpenInputConfig() diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h index 279aed8..eec2a48 100644 --- a/src/frontend/qt_sdl/main.h +++ b/src/frontend/qt_sdl/main.h @@ -199,6 +199,7 @@ private slots: void onStop(); void onOpenEmuSettings(); + void onEmuSettingsDialogFinished(int res); void onOpenInputConfig(); void onInputConfigFinished(int res); void onOpenVideoSettings(); diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp deleted file mode 100644 index 0df9c6c..0000000 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - Copyright 2016-2020 Arisotura - - This file is part of melonDS. - - melonDS is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation, either version 3 of the License, or (at your option) - any later version. - - melonDS is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with melonDS. If not, see http://www.gnu.org/licenses/. 
-*/ - -#include <stdlib.h> -#include <stdio.h> - -#include "libui/ui.h" - -#include "../types.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" - - -void ApplyNewSettings(int type); - -extern bool RunningSomething; - -namespace DlgEmuSettings -{ - -bool opened; -uiWindow* win; - -uiCheckbox* cbDirectBoot; - -#ifdef JIT_ENABLED -uiCheckbox* cbJITEnabled; -uiEntry* enJITMaxBlockSize; -uiCheckbox* cbJITBranchOptimisations; -uiCheckbox* cbJITLiteralOptimisations; -#endif - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - opened = false; - return 1; -} - -void OnCancel(uiButton* btn, void* blarg) -{ - uiControlDestroy(uiControl(win)); - opened = false; -} - -void OnOk(uiButton* btn, void* blarg) -{ -#ifdef JIT_ENABLED - bool restart = false; - - bool enableJit = uiCheckboxChecked(cbJITEnabled); - char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); - long blockSize = strtol(maxBlockSizeStr, NULL, 10); - bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); - bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); - uiFreeText(maxBlockSizeStr); - if (blockSize < 1) - blockSize = 1; - if (blockSize > 32) - blockSize = 32; - - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize - || branchOptimisations != Config::JIT_BrancheOptimisations - || literalOptimisations != Config::JIT_LiteralOptimisations) - { - if (RunningSomething && - !uiMsgBoxConfirm(win, "Reset emulator", - "Changing JIT settings requires a reset.\n\nDo you want to continue?")) - return; - - Config::JIT_Enable = enableJit; - Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = branchOptimisations; - Config::JIT_LiteralOptimisations = literalOptimisations; - - restart = true; - } -#endif - - Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); - - Config::Save(); - - uiControlDestroy(uiControl(win)); - opened = false; - -#ifdef JIT_ENABLED - if (restart) - ApplyNewSettings(4); -#endif -} - -#ifdef JIT_ENABLED -void OnJITStateChanged(uiCheckbox* cb, void* blarg) -{ - if (uiCheckboxChecked(cb)) - { - uiControlEnable(uiControl(enJITMaxBlockSize)); - uiControlEnable(uiControl(cbJITBranchOptimisations)); - uiControlEnable(uiControl(cbJITLiteralOptimisations)); - } - else - { - uiControlDisable(uiControl(enJITMaxBlockSize)); - uiControlDisable(uiControl(cbJITBranchOptimisations)); - uiControlDisable(uiControl(cbJITLiteralOptimisations)); - } -} -#endif - -void Open() -{ - if (opened) - { - uiControlSetFocus(uiControl(win)); - return; - } - - opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0); - uiWindowSetMargined(win, 1); - uiWindowOnClosing(win, OnCloseWindow, NULL); - - uiBox* top = uiNewVerticalBox(); - uiWindowSetChild(win, uiControl(top)); - - { - uiBox* in_ctrl = uiNewVerticalBox(); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - cbDirectBoot = uiNewCheckbox("Boot game directly"); - uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); - } - -#ifdef JIT_ENABLED - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiGroup* grp = uiNewGroup("JIT"); - uiBoxAppend(top, uiControl(grp), 1); - - uiBox* in_ctrl = uiNewVerticalBox(); - uiGroupSetChild(grp, uiControl(in_ctrl)); - - cbJITEnabled = uiNewCheckbox("Enable JIT recompiler"); - uiBoxAppend(in_ctrl, uiControl(cbJITEnabled), 0); - - uiCheckboxOnToggled(cbJITEnabled, OnJITStateChanged, NULL); - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("Maximum block size (1-32): "); - 
uiBoxAppend(row, uiControl(lbl), 0); - - enJITMaxBlockSize = uiNewEntry(); - uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); - uiBoxAppend(row, uiControl(lbl), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); - uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); - uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); - } - } -#endif - - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiBox* in_ctrl = uiNewHorizontalBox(); - uiBoxSetPadded(in_ctrl, 1); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(in_ctrl, uiControl(dummy), 1); - - uiButton* btncancel = uiNewButton("Cancel"); - uiButtonOnClicked(btncancel, OnCancel, NULL); - uiBoxAppend(in_ctrl, uiControl(btncancel), 0); - - uiButton* btnok = uiNewButton("Ok"); - uiButtonOnClicked(btnok, OnOk, NULL); - uiBoxAppend(in_ctrl, uiControl(btnok), 0); - } - - uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); - -#ifdef JIT_ENABLED - uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); - { - char maxBlockSizeStr[10]; - sprintf(maxBlockSizeStr, "%d", Config::JIT_MaxBlockSize); - uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); - } - OnJITStateChanged(cbJITEnabled, NULL); - - uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); - uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); -#endif - - uiControlShow(uiControl(win)); -} - -void Close() -{ - if (!opened) return; - uiControlDestroy(uiControl(win)); - opened = false; -} - -} diff --git a/src/libui_sdl/libui/ui.h b/src/libui_sdl/libui/ui.h deleted file mode 100644 index e45fe91..0000000 --- a/src/libui_sdl/libui/ui.h +++ /dev/null @@ -1,764 +0,0 @@ -// 6 april 2015 - -// TODO add a uiVerifyControlType() function that can be used by control implementations to verify controls - -#ifndef __LIBUI_UI_H__ -#define __LIBUI_UI_H__ - -#include <stddef.h> -#include <stdint.h> - -#ifdef __cplusplus -extern "C" { -#endif - -// this macro is generated by cmake -#ifdef libui_EXPORTS -#ifdef _WIN32 -#define _UI_EXTERN __declspec(dllexport) extern -#else -#define _UI_EXTERN __attribute__((visibility("default"))) extern -#endif -#else -// TODO add __declspec(dllimport) on windows, but only if not static -#define _UI_EXTERN extern -#endif - -// C++ is really really really really really really dumb about enums, so screw that and just make them anonymous -// This has the advantage of being ABI-able should we ever need an ABI... -#define _UI_ENUM(s) typedef unsigned int s; enum - -// This constant is provided because M_PI is nonstandard. -// This comes from Go's math.Pi, which in turn comes from http://oeis.org/A000796. -#define uiPi 3.14159265358979323846264338327950288419716939937510582097494459 - -// TODO uiBool? 
- -typedef struct uiInitOptions uiInitOptions; - -struct uiInitOptions { - size_t Size; -}; - -_UI_EXTERN const char *uiInit(uiInitOptions *options); -_UI_EXTERN void uiUninit(void); -_UI_EXTERN void uiFreeInitError(const char *err); - -_UI_EXTERN void uiMain(void); -_UI_EXTERN void uiMainSteps(void); -_UI_EXTERN int uiMainStep(int wait); -_UI_EXTERN void uiQuit(void); - -_UI_EXTERN void uiQueueMain(void (*f)(void *data), void *data); - -_UI_EXTERN void uiOnShouldQuit(int (*f)(void *data), void *data); - -_UI_EXTERN void uiFreeText(char *text); - -typedef struct uiControl uiControl; - -struct uiControl { - uint32_t Signature; - uint32_t OSSignature; - uint32_t TypeSignature; - void (*Destroy)(uiControl *); - uintptr_t (*Handle)(uiControl *); - uiControl *(*Parent)(uiControl *); - void (*SetParent)(uiControl *, uiControl *); - int (*Toplevel)(uiControl *); - int (*Visible)(uiControl *); - void (*Show)(uiControl *); - void (*Hide)(uiControl *); - int (*Enabled)(uiControl *); - void (*Enable)(uiControl *); - void (*Disable)(uiControl *); - void (*SetFocus)(uiControl *); - void (*SetMinSize)(uiControl*, int, int); - - int MinWidth, MinHeight; - - void* UserData; -}; -// TOOD add argument names to all arguments -#define uiControl(this) ((uiControl *) (this)) -_UI_EXTERN void uiControlDestroy(uiControl *); -_UI_EXTERN uintptr_t uiControlHandle(uiControl *); -_UI_EXTERN uiControl *uiControlParent(uiControl *); -_UI_EXTERN void uiControlSetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlToplevel(uiControl *); -_UI_EXTERN int uiControlVisible(uiControl *); -_UI_EXTERN void uiControlShow(uiControl *); -_UI_EXTERN void uiControlHide(uiControl *); -_UI_EXTERN int uiControlEnabled(uiControl *); -_UI_EXTERN void uiControlEnable(uiControl *); -_UI_EXTERN void uiControlDisable(uiControl *); -_UI_EXTERN void uiControlSetFocus(uiControl *); -_UI_EXTERN void uiControlSetMinSize(uiControl *, int w, int h); // -1 = no minimum - -_UI_EXTERN uiControl *uiAllocControl(size_t n, uint32_t OSsig, uint32_t typesig, const char *typenamestr); -_UI_EXTERN void uiFreeControl(uiControl *); - -// TODO make sure all controls have these -_UI_EXTERN void uiControlVerifySetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlEnabledToUser(uiControl *); - -_UI_EXTERN void uiUserBugCannotSetParentOnToplevel(const char *type); - -typedef struct uiWindow uiWindow; -#define uiWindow(this) ((uiWindow *) (this)) -_UI_EXTERN char *uiWindowTitle(uiWindow *w); -_UI_EXTERN void uiWindowSetTitle(uiWindow *w, const char *title); -_UI_EXTERN void uiWindowPosition(uiWindow *w, int *x, int *y); -_UI_EXTERN void uiWindowSetPosition(uiWindow *w, int x, int y); -_UI_EXTERN void uiWindowContentSize(uiWindow *w, int *width, int *height); -_UI_EXTERN void uiWindowSetContentSize(uiWindow *w, int width, int height); -_UI_EXTERN int uiWindowMinimized(uiWindow *w); -_UI_EXTERN void uiWindowSetMinimized(uiWindow *w, int minimized); -_UI_EXTERN int uiWindowMaximized(uiWindow *w); -_UI_EXTERN void uiWindowSetMaximized(uiWindow *w, int maximized); -_UI_EXTERN int uiWindowFullscreen(uiWindow *w); -_UI_EXTERN void uiWindowSetFullscreen(uiWindow *w, int fullscreen); -_UI_EXTERN int uiWindowBorderless(uiWindow *w); -_UI_EXTERN void uiWindowSetBorderless(uiWindow *w, int borderless); -_UI_EXTERN void uiWindowSetChild(uiWindow *w, uiControl *child); -_UI_EXTERN int uiWindowMargined(uiWindow *w); -_UI_EXTERN void uiWindowSetMargined(uiWindow *w, int margined); -_UI_EXTERN void uiWindowSetDropTarget(uiWindow* w, int drop); -_UI_EXTERN uiWindow 
*uiNewWindow(const char *title, int width, int height, int maximized, int hasMenubar, int resizable); - -_UI_EXTERN void uiWindowOnContentSizeChanged(uiWindow *w, void (*f)(uiWindow *, void *), void *data); -_UI_EXTERN void uiWindowOnClosing(uiWindow *w, int (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnDropFile(uiWindow *w, void (*f)(uiWindow *w, char *file, void *data), void *data); -_UI_EXTERN void uiWindowOnGetFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnLoseFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); - -typedef struct uiButton uiButton; -#define uiButton(this) ((uiButton *) (this)) -_UI_EXTERN char *uiButtonText(uiButton *b); -_UI_EXTERN void uiButtonSetText(uiButton *b, const char *text); -_UI_EXTERN void uiButtonOnClicked(uiButton *b, void (*f)(uiButton *b, void *data), void *data); -_UI_EXTERN uiButton *uiNewButton(const char *text); - -typedef struct uiBox uiBox; -#define uiBox(this) ((uiBox *) (this)) -_UI_EXTERN void uiBoxAppend(uiBox *b, uiControl *child, int stretchy); -_UI_EXTERN void uiBoxDelete(uiBox *b, int index); -_UI_EXTERN int uiBoxPadded(uiBox *b); -_UI_EXTERN void uiBoxSetPadded(uiBox *b, int padded); -_UI_EXTERN uiBox *uiNewHorizontalBox(void); -_UI_EXTERN uiBox *uiNewVerticalBox(void); - -typedef struct uiCheckbox uiCheckbox; -#define uiCheckbox(this) ((uiCheckbox *) (this)) -_UI_EXTERN char *uiCheckboxText(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetText(uiCheckbox *c, const char *text); -_UI_EXTERN void uiCheckboxOnToggled(uiCheckbox *c, void (*f)(uiCheckbox *c, void *data), void *data); -_UI_EXTERN int uiCheckboxChecked(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetChecked(uiCheckbox *c, int checked); -_UI_EXTERN uiCheckbox *uiNewCheckbox(const char *text); - -typedef struct uiEntry uiEntry; -#define uiEntry(this) ((uiEntry *) (this)) -_UI_EXTERN char *uiEntryText(uiEntry *e); -_UI_EXTERN void uiEntrySetText(uiEntry *e, const char *text); -_UI_EXTERN void uiEntryOnChanged(uiEntry *e, void (*f)(uiEntry *e, void *data), void *data); -_UI_EXTERN int uiEntryReadOnly(uiEntry *e); -_UI_EXTERN void uiEntrySetReadOnly(uiEntry *e, int readonly); -_UI_EXTERN uiEntry *uiNewEntry(void); -_UI_EXTERN uiEntry *uiNewPasswordEntry(void); -_UI_EXTERN uiEntry *uiNewSearchEntry(void); - -typedef struct uiLabel uiLabel; -#define uiLabel(this) ((uiLabel *) (this)) -_UI_EXTERN char *uiLabelText(uiLabel *l); -_UI_EXTERN void uiLabelSetText(uiLabel *l, const char *text); -_UI_EXTERN uiLabel *uiNewLabel(const char *text); - -typedef struct uiTab uiTab; -#define uiTab(this) ((uiTab *) (this)) -_UI_EXTERN void uiTabAppend(uiTab *t, const char *name, uiControl *c); -_UI_EXTERN void uiTabInsertAt(uiTab *t, const char *name, int before, uiControl *c); -_UI_EXTERN void uiTabDelete(uiTab *t, int index); -_UI_EXTERN int uiTabNumPages(uiTab *t); -_UI_EXTERN int uiTabMargined(uiTab *t, int page); -_UI_EXTERN void uiTabSetMargined(uiTab *t, int page, int margined); -_UI_EXTERN uiTab *uiNewTab(void); - -typedef struct uiGroup uiGroup; -#define uiGroup(this) ((uiGroup *) (this)) -_UI_EXTERN char *uiGroupTitle(uiGroup *g); -_UI_EXTERN void uiGroupSetTitle(uiGroup *g, const char *title); -_UI_EXTERN void uiGroupSetChild(uiGroup *g, uiControl *c); -_UI_EXTERN int uiGroupMargined(uiGroup *g); -_UI_EXTERN void uiGroupSetMargined(uiGroup *g, int margined); -_UI_EXTERN uiGroup *uiNewGroup(const char *title); - -// spinbox/slider rules: -// setting value outside of range will automatically clamp -// initial 
value is minimum -// complaint if min >= max? - -typedef struct uiSpinbox uiSpinbox; -#define uiSpinbox(this) ((uiSpinbox *) (this)) -_UI_EXTERN int uiSpinboxValue(uiSpinbox *s); -_UI_EXTERN void uiSpinboxSetValue(uiSpinbox *s, int value); -_UI_EXTERN void uiSpinboxOnChanged(uiSpinbox *s, void (*f)(uiSpinbox *s, void *data), void *data); -_UI_EXTERN uiSpinbox *uiNewSpinbox(int min, int max); - -typedef struct uiSlider uiSlider; -#define uiSlider(this) ((uiSlider *) (this)) -_UI_EXTERN int uiSliderValue(uiSlider *s); -_UI_EXTERN void uiSliderSetValue(uiSlider *s, int value); -_UI_EXTERN void uiSliderOnChanged(uiSlider *s, void (*f)(uiSlider *s, void *data), void *data); -_UI_EXTERN uiSlider *uiNewSlider(int min, int max); - -typedef struct uiProgressBar uiProgressBar; -#define uiProgressBar(this) ((uiProgressBar *) (this)) -_UI_EXTERN int uiProgressBarValue(uiProgressBar *p); -_UI_EXTERN void uiProgressBarSetValue(uiProgressBar *p, int n); -_UI_EXTERN uiProgressBar *uiNewProgressBar(void); - -typedef struct uiSeparator uiSeparator; -#define uiSeparator(this) ((uiSeparator *) (this)) -_UI_EXTERN uiSeparator *uiNewHorizontalSeparator(void); -_UI_EXTERN uiSeparator *uiNewVerticalSeparator(void); - -typedef struct uiCombobox uiCombobox; -#define uiCombobox(this) ((uiCombobox *) (this)) -_UI_EXTERN void uiComboboxAppend(uiCombobox *c, const char *text); -_UI_EXTERN int uiComboboxSelected(uiCombobox *c); -_UI_EXTERN void uiComboboxSetSelected(uiCombobox *c, int n); -_UI_EXTERN void uiComboboxOnSelected(uiCombobox *c, void (*f)(uiCombobox *c, void *data), void *data); -_UI_EXTERN uiCombobox *uiNewCombobox(void); - -typedef struct uiEditableCombobox uiEditableCombobox; -#define uiEditableCombobox(this) ((uiEditableCombobox *) (this)) -_UI_EXTERN void uiEditableComboboxAppend(uiEditableCombobox *c, const char *text); -_UI_EXTERN char *uiEditableComboboxText(uiEditableCombobox *c); -_UI_EXTERN void uiEditableComboboxSetText(uiEditableCombobox *c, const char *text); -// TODO what do we call a function that sets the currently selected item and fills the text field with it? editable comboboxes have no consistent concept of selected item -_UI_EXTERN void uiEditableComboboxOnChanged(uiEditableCombobox *c, void (*f)(uiEditableCombobox *c, void *data), void *data); -_UI_EXTERN uiEditableCombobox *uiNewEditableCombobox(void); - -typedef struct uiRadioButtons uiRadioButtons; -#define uiRadioButtons(this) ((uiRadioButtons *) (this)) -_UI_EXTERN void uiRadioButtonsAppend(uiRadioButtons *r, const char *text); -_UI_EXTERN int uiRadioButtonsSelected(uiRadioButtons *r); -_UI_EXTERN void uiRadioButtonsSetSelected(uiRadioButtons *r, int n); -_UI_EXTERN void uiRadioButtonsOnSelected(uiRadioButtons *r, void (*f)(uiRadioButtons *, void *), void *data); -_UI_EXTERN uiRadioButtons *uiNewRadioButtons(void); - -typedef struct uiDateTimePicker uiDateTimePicker; -#define uiDateTimePicker(this) ((uiDateTimePicker *) (this)) -_UI_EXTERN uiDateTimePicker *uiNewDateTimePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewDatePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewTimePicker(void); - -// TODO provide a facility for entering tab stops? 
-typedef struct uiMultilineEntry uiMultilineEntry; -#define uiMultilineEntry(this) ((uiMultilineEntry *) (this)) -_UI_EXTERN char *uiMultilineEntryText(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetText(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryAppend(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryOnChanged(uiMultilineEntry *e, void (*f)(uiMultilineEntry *e, void *data), void *data); -_UI_EXTERN int uiMultilineEntryReadOnly(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetReadOnly(uiMultilineEntry *e, int readonly); -_UI_EXTERN uiMultilineEntry *uiNewMultilineEntry(void); -_UI_EXTERN uiMultilineEntry *uiNewNonWrappingMultilineEntry(void); - -typedef struct uiMenuItem uiMenuItem; -#define uiMenuItem(this) ((uiMenuItem *) (this)) -_UI_EXTERN void uiMenuItemEnable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemDisable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemOnClicked(uiMenuItem *m, void (*f)(uiMenuItem *sender, uiWindow *window, void *data), void *data); -_UI_EXTERN int uiMenuItemChecked(uiMenuItem *m); -_UI_EXTERN void uiMenuItemSetChecked(uiMenuItem *m, int checked); - -typedef struct uiMenu uiMenu; -#define uiMenu(this) ((uiMenu *) (this)) -_UI_EXTERN uiMenuItem *uiMenuAppendItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendCheckItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendQuitItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendPreferencesItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendAboutItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendSubmenu(uiMenu *m, uiMenu* child); -_UI_EXTERN void uiMenuAppendSeparator(uiMenu *m); -_UI_EXTERN uiMenu *uiNewMenu(const char *name); - -_UI_EXTERN char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN void uiMsgBox(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN void uiMsgBoxError(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description); - -typedef struct uiArea uiArea; -typedef struct uiAreaHandler uiAreaHandler; -typedef struct uiAreaDrawParams uiAreaDrawParams; -typedef struct uiAreaMouseEvent uiAreaMouseEvent; -typedef struct uiAreaKeyEvent uiAreaKeyEvent; - -typedef struct uiDrawContext uiDrawContext; - -// TO CONSIDER: the uiAreaHandler param there seems useless -// (might use individual callbacks instead of handler struct?) -struct uiAreaHandler { - void (*Draw)(uiAreaHandler *, uiArea *, uiAreaDrawParams *); - // TODO document that resizes cause a full redraw for non-scrolling areas; implementation-defined for scrolling areas - void (*MouseEvent)(uiAreaHandler *, uiArea *, uiAreaMouseEvent *); - // TODO document that on first show if the mouse is already in the uiArea then one gets sent with left=0 - // TODO what about when the area is hidden and then shown again? - void (*MouseCrossed)(uiAreaHandler *, uiArea *, int left); - void (*DragBroken)(uiAreaHandler *, uiArea *); - int (*KeyEvent)(uiAreaHandler *, uiArea *, uiAreaKeyEvent *); - void (*Resize)(uiAreaHandler *, uiArea *, int, int); -}; - -// TODO RTL layouts? 
-// TODO reconcile edge and corner naming -_UI_ENUM(uiWindowResizeEdge) { - uiWindowResizeEdgeLeft, - uiWindowResizeEdgeTop, - uiWindowResizeEdgeRight, - uiWindowResizeEdgeBottom, - uiWindowResizeEdgeTopLeft, - uiWindowResizeEdgeTopRight, - uiWindowResizeEdgeBottomLeft, - uiWindowResizeEdgeBottomRight, - // TODO have one for keyboard resizes? - // TODO GDK doesn't seem to have any others, including for keyboards... - // TODO way to bring up the system menu instead? -}; - -#define uiGLVersion(major, minor) ((major) | ((minor)<<16)) -#define uiGLVerMajor(ver) ((ver) & 0xFFFF) -#define uiGLVerMinor(ver) ((ver) >> 16) - -#define uiArea(this) ((uiArea *) (this)) -// TODO give a better name -// TODO document the types of width and height -_UI_EXTERN void uiAreaSetSize(uiArea *a, int width, int height); -// TODO uiAreaQueueRedraw() -_UI_EXTERN void uiAreaQueueRedrawAll(uiArea *a); -_UI_EXTERN void uiAreaScrollTo(uiArea *a, double x, double y, double width, double height); -// TODO document these can only be called within Mouse() handlers -// TODO should these be allowed on scrolling areas? -// TODO decide which mouse events should be accepted; Down is the only one guaranteed to work right now -// TODO what happens to events after calling this up to and including the next mouse up? -// TODO release capture? -_UI_EXTERN void uiAreaBeginUserWindowMove(uiArea *a); -_UI_EXTERN void uiAreaBeginUserWindowResize(uiArea *a, uiWindowResizeEdge edge); -_UI_EXTERN void uiAreaSetBackgroundColor(uiArea *a, int r, int g, int b); -_UI_EXTERN uiArea *uiNewArea(uiAreaHandler *ah); -_UI_EXTERN uiArea *uiNewGLArea(uiAreaHandler *ah, const unsigned int* req_versions); -_UI_EXTERN uiArea *uiNewScrollingArea(uiAreaHandler *ah, int width, int height); - -struct uiAreaDrawParams { - uiDrawContext *Context; - - // TODO document that this is only defined for nonscrolling areas - double AreaWidth; - double AreaHeight; - - double ClipX; - double ClipY; - double ClipWidth; - double ClipHeight; -}; - -typedef struct uiDrawPath uiDrawPath; -typedef struct uiDrawBrush uiDrawBrush; -typedef struct uiDrawStrokeParams uiDrawStrokeParams; -typedef struct uiDrawMatrix uiDrawMatrix; - -typedef struct uiDrawBrushGradientStop uiDrawBrushGradientStop; - -typedef struct uiDrawBitmap uiDrawBitmap; - -_UI_ENUM(uiDrawBrushType) { - uiDrawBrushTypeSolid, - uiDrawBrushTypeLinearGradient, - uiDrawBrushTypeRadialGradient, - uiDrawBrushTypeImage, -}; - -_UI_ENUM(uiDrawLineCap) { - uiDrawLineCapFlat, - uiDrawLineCapRound, - uiDrawLineCapSquare, -}; - -_UI_ENUM(uiDrawLineJoin) { - uiDrawLineJoinMiter, - uiDrawLineJoinRound, - uiDrawLineJoinBevel, -}; - -// this is the default for both cairo and Direct2D (in the latter case, from the C++ helper functions) -// Core Graphics doesn't explicitly specify a default, but NSBezierPath allows you to choose one, and this is the initial value -// so we're good to use it too!
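(Context for the default defined next: a miter limit is the largest allowed ratio of miter length to stroke width before a miter join falls back to a bevel. For a join of interior angle theta, that ratio is 1/sin(theta/2), so a limit of 10.0 kicks in once theta < 2*arcsin(1/10), roughly 11.5 degrees. This is the usual PostScript-style convention, stated here for reference rather than taken from the header itself.)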
-#define uiDrawDefaultMiterLimit 10.0 - -_UI_ENUM(uiDrawFillMode) { - uiDrawFillModeWinding, - uiDrawFillModeAlternate, -}; - -struct uiDrawMatrix { - double M11; - double M12; - double M21; - double M22; - double M31; - double M32; -}; - -struct uiDrawBrush { - uiDrawBrushType Type; - - // solid brushes - double R; - double G; - double B; - double A; - - // gradient brushes - double X0; // linear: start X, radial: start X - double Y0; // linear: start Y, radial: start Y - double X1; // linear: end X, radial: outer circle center X - double Y1; // linear: end Y, radial: outer circle center Y - double OuterRadius; // radial gradients only - uiDrawBrushGradientStop *Stops; - size_t NumStops; - // TODO extend mode - // cairo: none, repeat, reflect, pad; no individual control - // Direct2D: repeat, reflect, pad; no individual control - // Core Graphics: none, pad; before and after individually - // TODO cairo documentation is inconsistent about pad - - // TODO images - - // TODO transforms -}; - -struct uiDrawBrushGradientStop { - double Pos; - double R; - double G; - double B; - double A; -}; - -struct uiDrawStrokeParams { - uiDrawLineCap Cap; - uiDrawLineJoin Join; - // TODO what if this is 0? on windows there will be a crash with dashing - double Thickness; - double MiterLimit; - double *Dashes; - // TODO what if this is 1 on Direct2D? - // TODO what if a dash is 0 on Cairo or Quartz? - size_t NumDashes; - double DashPhase; -}; - -struct uiRect { - int X; - int Y; - int Width; - int Height; -}; - -typedef struct uiRect uiRect; - -_UI_EXTERN uiDrawPath *uiDrawNewPath(uiDrawFillMode fillMode); -_UI_EXTERN void uiDrawFreePath(uiDrawPath *p); - -_UI_EXTERN void uiDrawPathNewFigure(uiDrawPath *p, double x, double y); -_UI_EXTERN void uiDrawPathNewFigureWithArc(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathLineTo(uiDrawPath *p, double x, double y); -// notes: angles are both relative to 0 and go counterclockwise -// TODO is the initial line segment on cairo and OS X a proper join? -// TODO what if sweep < 0?
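Putting the brush, stroke, and path pieces above together; an illustrative sketch (uiDrawPathEnd, uiDrawStroke and the arc call are declared just below, and the function name here is illustrative):

static void DrawSomething(uiDrawContext* c)
{
    uiDrawBrush brush = {};
    brush.Type = uiDrawBrushTypeSolid;
    brush.R = 1.0; brush.A = 1.0;            // opaque red

    uiDrawStrokeParams sp = {};
    sp.Cap = uiDrawLineCapRound;
    sp.Join = uiDrawLineJoinMiter;
    sp.Thickness = 2.0;                      // note the TODO above about 0 here
    sp.MiterLimit = uiDrawDefaultMiterLimit; // no dashes: Dashes/NumDashes left zeroed

    uiDrawPath* path = uiDrawNewPath(uiDrawFillModeWinding);
    uiDrawPathNewFigure(path, 10, 10);
    uiDrawPathLineTo(path, 100, 10);
    uiDrawPathLineTo(path, 100, 60);
    uiDrawPathEnd(path);

    uiDrawStroke(c, path, &brush, &sp);
    uiDrawFreePath(path);
}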
-_UI_EXTERN void uiDrawPathArcTo(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathBezierTo(uiDrawPath *p, double c1x, double c1y, double c2x, double c2y, double endX, double endY); -// TODO quadratic bezier -_UI_EXTERN void uiDrawPathCloseFigure(uiDrawPath *p); - -// TODO effect of these when a figure is already started -_UI_EXTERN void uiDrawPathAddRectangle(uiDrawPath *p, double x, double y, double width, double height); - -_UI_EXTERN void uiDrawPathEnd(uiDrawPath *p); - -_UI_EXTERN void uiDrawStroke(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b, uiDrawStrokeParams *p); -_UI_EXTERN void uiDrawFill(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b); - -// TODO primitives: -// - rounded rectangles -// - elliptical arcs -// - quadratic bezier curves - -_UI_EXTERN void uiDrawMatrixSetIdentity(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTranslate(uiDrawMatrix *m, double x, double y); -_UI_EXTERN void uiDrawMatrixScale(uiDrawMatrix *m, double xCenter, double yCenter, double x, double y); -_UI_EXTERN void uiDrawMatrixRotate(uiDrawMatrix *m, double x, double y, double amount); -_UI_EXTERN void uiDrawMatrixSkew(uiDrawMatrix *m, double x, double y, double xamount, double yamount); -_UI_EXTERN void uiDrawMatrixMultiply(uiDrawMatrix *dest, uiDrawMatrix *src); -_UI_EXTERN int uiDrawMatrixInvertible(uiDrawMatrix *m); -_UI_EXTERN int uiDrawMatrixInvert(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTransformPoint(uiDrawMatrix *m, double *x, double *y); -_UI_EXTERN void uiDrawMatrixTransformSize(uiDrawMatrix *m, double *x, double *y); - -_UI_EXTERN void uiDrawTransform(uiDrawContext *c, uiDrawMatrix *m); - -// TODO add a uiDrawPathStrokeToFill() or something like that -_UI_EXTERN void uiDrawClip(uiDrawContext *c, uiDrawPath *path); - -_UI_EXTERN void uiDrawSave(uiDrawContext *c); -_UI_EXTERN void uiDrawRestore(uiDrawContext *c); - -// bitmap API -_UI_EXTERN uiDrawBitmap* uiDrawNewBitmap(uiDrawContext* c, int width, int height, int alpha); -_UI_EXTERN void uiDrawBitmapUpdate(uiDrawBitmap* bmp, const void* data); -_UI_EXTERN void uiDrawBitmapDraw(uiDrawContext* c, uiDrawBitmap* bmp, uiRect* srcrect, uiRect* dstrect, int filter); -_UI_EXTERN void uiDrawFreeBitmap(uiDrawBitmap* bmp); - -// TODO manage the use of Text, Font, and TextFont, and of the uiDrawText prefix in general - -///// TODO reconsider this -typedef struct uiDrawFontFamilies uiDrawFontFamilies; - -_UI_EXTERN uiDrawFontFamilies *uiDrawListFontFamilies(void); -_UI_EXTERN int uiDrawFontFamiliesNumFamilies(uiDrawFontFamilies *ff); -_UI_EXTERN char *uiDrawFontFamiliesFamily(uiDrawFontFamilies *ff, int n); -_UI_EXTERN void uiDrawFreeFontFamilies(uiDrawFontFamilies *ff); -///// END TODO - -typedef struct uiDrawTextLayout uiDrawTextLayout; -typedef struct uiDrawTextFont uiDrawTextFont; -typedef struct uiDrawTextFontDescriptor uiDrawTextFontDescriptor; -typedef struct uiDrawTextFontMetrics uiDrawTextFontMetrics; - -_UI_ENUM(uiDrawTextWeight) { - uiDrawTextWeightThin, - uiDrawTextWeightUltraLight, - uiDrawTextWeightLight, - uiDrawTextWeightBook, - uiDrawTextWeightNormal, - uiDrawTextWeightMedium, - uiDrawTextWeightSemiBold, - uiDrawTextWeightBold, - uiDrawTextWeightUltraBold, - uiDrawTextWeightHeavy, - uiDrawTextWeightUltraHeavy, -}; - -_UI_ENUM(uiDrawTextItalic) { - uiDrawTextItalicNormal, - uiDrawTextItalicOblique, - uiDrawTextItalicItalic, -}; - -_UI_ENUM(uiDrawTextStretch) { - uiDrawTextStretchUltraCondensed, - uiDrawTextStretchExtraCondensed, - 
uiDrawTextStretchCondensed, - uiDrawTextStretchSemiCondensed, - uiDrawTextStretchNormal, - uiDrawTextStretchSemiExpanded, - uiDrawTextStretchExpanded, - uiDrawTextStretchExtraExpanded, - uiDrawTextStretchUltraExpanded, -}; - -struct uiDrawTextFontDescriptor { - const char *Family; - double Size; - uiDrawTextWeight Weight; - uiDrawTextItalic Italic; - uiDrawTextStretch Stretch; -}; - -struct uiDrawTextFontMetrics { - double Ascent; - double Descent; - double Leading; - // TODO do these two mean the same across all platforms? - double UnderlinePos; - double UnderlineThickness; -}; - -_UI_EXTERN uiDrawTextFont *uiDrawLoadClosestFont(const uiDrawTextFontDescriptor *desc); -_UI_EXTERN void uiDrawFreeTextFont(uiDrawTextFont *font); -_UI_EXTERN uintptr_t uiDrawTextFontHandle(uiDrawTextFont *font); -_UI_EXTERN void uiDrawTextFontDescribe(uiDrawTextFont *font, uiDrawTextFontDescriptor *desc); -// TODO make copy with given attributes methods? -// TODO yuck this name -_UI_EXTERN void uiDrawTextFontGetMetrics(uiDrawTextFont *font, uiDrawTextFontMetrics *metrics); - -// TODO initial line spacing? and what about leading? -_UI_EXTERN uiDrawTextLayout *uiDrawNewTextLayout(const char *text, uiDrawTextFont *defaultFont, double width); -_UI_EXTERN void uiDrawFreeTextLayout(uiDrawTextLayout *layout); -// TODO get width -_UI_EXTERN void uiDrawTextLayoutSetWidth(uiDrawTextLayout *layout, double width); -_UI_EXTERN void uiDrawTextLayoutExtents(uiDrawTextLayout *layout, double *width, double *height); - -// and the attributes that you can set on a text layout -_UI_EXTERN void uiDrawTextLayoutSetColor(uiDrawTextLayout *layout, int startChar, int endChar, double r, double g, double b, double a); - -_UI_EXTERN void uiDrawText(uiDrawContext *c, double x, double y, uiDrawTextLayout *layout); - - -// OpenGL support - -typedef struct uiGLContext uiGLContext; - -_UI_EXTERN uiGLContext *uiAreaGetGLContext(uiArea* a); -_UI_EXTERN void uiGLMakeContextCurrent(uiGLContext* ctx); -_UI_EXTERN void uiGLBegin(uiGLContext* ctx); -_UI_EXTERN void uiGLEnd(uiGLContext* ctx); -_UI_EXTERN unsigned int uiGLGetVersion(uiGLContext* ctx); -_UI_EXTERN void *uiGLGetProcAddress(const char* proc); -_UI_EXTERN int uiGLGetFramebuffer(uiGLContext* ctx); -_UI_EXTERN float uiGLGetFramebufferScale(uiGLContext* ctx); -_UI_EXTERN void uiGLSwapBuffers(uiGLContext* ctx); -_UI_EXTERN void uiGLSetVSync(int sync); - - -_UI_ENUM(uiModifiers) { - uiModifierCtrl = 1 << 0, - uiModifierAlt = 1 << 1, - uiModifierShift = 1 << 2, - uiModifierSuper = 1 << 3, -}; - -// TODO document drag captures -struct uiAreaMouseEvent { - // TODO document what these mean for scrolling areas - double X; - double Y; - - // TODO see draw above - double AreaWidth; - double AreaHeight; - - int Down; - int Up; - - int Count; - - uiModifiers Modifiers; - - uint64_t Held1To64; -}; - -_UI_ENUM(uiExtKey) { - uiExtKeyEscape = 1, - uiExtKeyInsert, // equivalent to "Help" on Apple keyboards - uiExtKeyDelete, - uiExtKeyHome, - uiExtKeyEnd, - uiExtKeyPageUp, - uiExtKeyPageDown, - uiExtKeyUp, - uiExtKeyDown, - uiExtKeyLeft, - uiExtKeyRight, - uiExtKeyF1, // F1..F12 are guaranteed to be consecutive - uiExtKeyF2, - uiExtKeyF3, - uiExtKeyF4, - uiExtKeyF5, - uiExtKeyF6, - uiExtKeyF7, - uiExtKeyF8, - uiExtKeyF9, - uiExtKeyF10, - uiExtKeyF11, - uiExtKeyF12, - uiExtKeyN0, // numpad keys; independent of Num Lock state - uiExtKeyN1, // N0..N9 are guaranteed to be consecutive - uiExtKeyN2, - uiExtKeyN3, - uiExtKeyN4, - uiExtKeyN5, - uiExtKeyN6, - uiExtKeyN7, - uiExtKeyN8, - uiExtKeyN9, - uiExtKeyNDot, - 
uiExtKeyNEnter, - uiExtKeyNAdd, - uiExtKeyNSubtract, - uiExtKeyNMultiply, - uiExtKeyNDivide, -}; - -struct uiAreaKeyEvent { - char Key; - uiExtKey ExtKey; - uiModifiers Modifier; - - uiModifiers Modifiers; - - // additional things - int Scancode; // bit0-7: scancode, bit8: ext flag - - int Up; - int Repeat; -}; - -typedef struct uiFontButton uiFontButton; -#define uiFontButton(this) ((uiFontButton *) (this)) -// TODO document this returns a new font -_UI_EXTERN uiDrawTextFont *uiFontButtonFont(uiFontButton *b); -// TODO SetFont, mechanics -_UI_EXTERN void uiFontButtonOnChanged(uiFontButton *b, void (*f)(uiFontButton *, void *), void *data); -_UI_EXTERN uiFontButton *uiNewFontButton(void); - -typedef struct uiColorButton uiColorButton; -#define uiColorButton(this) ((uiColorButton *) (this)) -_UI_EXTERN void uiColorButtonColor(uiColorButton *b, double *r, double *g, double *bl, double *a); -_UI_EXTERN void uiColorButtonSetColor(uiColorButton *b, double r, double g, double bl, double a); -_UI_EXTERN void uiColorButtonOnChanged(uiColorButton *b, void (*f)(uiColorButton *, void *), void *data); -_UI_EXTERN uiColorButton *uiNewColorButton(void); - -typedef struct uiForm uiForm; -#define uiForm(this) ((uiForm *) (this)) -_UI_EXTERN void uiFormAppend(uiForm *f, const char *label, uiControl *c, int stretchy); -_UI_EXTERN void uiFormDelete(uiForm *f, int index); -_UI_EXTERN int uiFormPadded(uiForm *f); -_UI_EXTERN void uiFormSetPadded(uiForm *f, int padded); -_UI_EXTERN uiForm *uiNewForm(void); - -_UI_ENUM(uiAlign) { - uiAlignFill, - uiAlignStart, - uiAlignCenter, - uiAlignEnd, -}; - -_UI_ENUM(uiAt) { - uiAtLeading, - uiAtTop, - uiAtTrailing, - uiAtBottom, -}; - -typedef struct uiGrid uiGrid; -#define uiGrid(this) ((uiGrid *) (this)) -_UI_EXTERN void uiGridAppend(uiGrid *g, uiControl *c, int left, int top, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN void uiGridInsertAt(uiGrid *g, uiControl *c, uiControl *existing, uiAt at, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN int uiGridPadded(uiGrid *g); -_UI_EXTERN void uiGridSetPadded(uiGrid *g, int padded); -_UI_EXTERN uiGrid *uiNewGrid(void); - - -// misc.
- -_UI_EXTERN char* uiKeyName(int scancode); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/libui_sdl/libui/unix/stddialogs.c b/src/libui_sdl/libui/unix/stddialogs.c deleted file mode 100644 index 10c598d..0000000 --- a/src/libui_sdl/libui/unix/stddialogs.c +++ /dev/null @@ -1,126 +0,0 @@ -// 26 june 2015 -#include "uipriv_unix.h" - -// LONGTERM figure out why, and describe, that this is the desired behavior -// LONGTERM also point out that font and color buttons also work like this - -#define windowWindow(w) ((w)?(GTK_WINDOW(uiControlHandle(uiControl(w)))):NULL) - -static char *filedialog(GtkWindow *parent, GtkFileChooserAction mode, const gchar *confirm, const char* filter, const char* initpath) -{ - GtkWidget *fcd; - GtkFileChooser *fc; - gint response; - char *filename; - - fcd = gtk_file_chooser_dialog_new(NULL, parent, mode, - "_Cancel", GTK_RESPONSE_CANCEL, - confirm, GTK_RESPONSE_ACCEPT, - NULL); - fc = GTK_FILE_CHOOSER(fcd); - - // filters - { - gchar _filter[256]; - gchar* fp = &_filter[0]; int s = 0; - gchar* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - GtkFileFilter* filter = gtk_file_filter_new(); - gtk_file_filter_set_name(filter, fname); - - for (gchar* j = fp; ; j++) - { - if (*j == ';') - { - *j = '\0'; - gtk_file_filter_add_pattern(filter, fp); - fp = j+1; - } - else if (*j == '\0') - { - gtk_file_filter_add_pattern(filter, fp); - break; - } - } - - gtk_file_chooser_add_filter(fc, filter); - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - } - - gtk_file_chooser_set_local_only(fc, FALSE); - gtk_file_chooser_set_select_multiple(fc, FALSE); - gtk_file_chooser_set_show_hidden(fc, TRUE); - gtk_file_chooser_set_do_overwrite_confirmation(fc, TRUE); - gtk_file_chooser_set_create_folders(fc, TRUE); - if (initpath && strlen(initpath)>0) - gtk_file_chooser_set_current_folder(fc, initpath); - - response = gtk_dialog_run(GTK_DIALOG(fcd)); - if (response != GTK_RESPONSE_ACCEPT) { - gtk_widget_destroy(fcd); - return NULL; - } - filename = uiUnixStrdupText(gtk_file_chooser_get_filename(fc)); - gtk_widget_destroy(fcd); - return filename; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_OPEN, "_Open", filter, initpath); -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_SAVE, "_Save", filter, initpath); -} - -static int msgbox(GtkWindow *parent, const char *title, const char *description, GtkMessageType type, GtkButtonsType buttons) -{ - GtkWidget *md; - - md = gtk_message_dialog_new(parent, GTK_DIALOG_MODAL, - type, buttons, - "%s", title); - gtk_message_dialog_format_secondary_text(GTK_MESSAGE_DIALOG(md), "%s", description); - int result = gtk_dialog_run(GTK_DIALOG(md)); - gtk_widget_destroy(md); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_OTHER, GTK_BUTTONS_OK); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_ERROR, GTK_BUTTONS_OK); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - int result = - 
msgbox(windowWindow(parent), title, description, GTK_MESSAGE_QUESTION, GTK_BUTTONS_OK_CANCEL); - - return result == GTK_RESPONSE_OK; -} \ No newline at end of file diff --git a/src/libui_sdl/libui/windows/stddialogs.cpp b/src/libui_sdl/libui/windows/stddialogs.cpp deleted file mode 100644 index 7537015..0000000 --- a/src/libui_sdl/libui/windows/stddialogs.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// 22 may 2015 -#include "uipriv_windows.hpp" - -// TODO document all this is what we want -// TODO do the same for font and color buttons - -// notes: -// - FOS_SUPPORTSTREAMABLEITEMS doesn't seem to be supported on windows vista, or at least not with the flags we use -// - even with FOS_NOVALIDATE the dialogs will reject invalid filenames (at least on Vista, anyway) -// - lack of FOS_NOREADONLYRETURN doesn't seem to matter on Windows 7 - -// TODO -// - http://blogs.msdn.com/b/wpfsdk/archive/2006/10/26/uncommon-dialogs--font-chooser-and-color-picker-dialogs.aspx -// - when a dialog is active, tab navigation in other windows stops working -// - when adding uiOpenFolder(), use IFileDialog as well - https://msdn.microsoft.com/en-us/library/windows/desktop/bb762115%28v=vs.85%29.aspx - -#define windowHWND(w) (w ? (HWND) uiControlHandle(uiControl(w)) : NULL) - -char *commonItemDialog(HWND parent, REFCLSID clsid, REFIID iid, const char* filter, const char* initpath, FILEOPENDIALOGOPTIONS optsadd) -{ - IFileDialog *d = NULL; - FILEOPENDIALOGOPTIONS opts; - IShellItem *result = NULL; - WCHAR *wname = NULL; - char *name = NULL; - HRESULT hr; - - hr = CoCreateInstance(clsid, - NULL, CLSCTX_INPROC_SERVER, - iid, (LPVOID *) (&d)); - if (hr != S_OK) { - logHRESULT(L"error creating common item dialog", hr); - // always return NULL on error - goto out; - } - hr = d->GetOptions(&opts); - if (hr != S_OK) { - logHRESULT(L"error getting current options", hr); - goto out; - } - opts |= optsadd; - // the other platforms don't check read-only; we won't either - opts &= ~FOS_NOREADONLYRETURN; - hr = d->SetOptions(opts); - if (hr != S_OK) { - logHRESULT(L"error setting options", hr); - goto out; - } - - // filters - { - COMDLG_FILTERSPEC filterspec[8]; - wchar_t _filter[256]; - wchar_t* fp = &_filter[0]; int s = 0; - wchar_t* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - filterspec[s>>1].pszName = fname; - filterspec[s>>1].pszSpec = fp; - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - d->SetFileTypes(s>>1, filterspec); - } - - hr = d->Show(parent); - if (hr == HRESULT_FROM_WIN32(ERROR_CANCELLED)) - // cancelled; return NULL like we have ready - goto out; - if (hr != S_OK) { - logHRESULT(L"error showing dialog", hr); - goto out; - } - hr = d->GetResult(&result); - if (hr != S_OK) { - logHRESULT(L"error getting dialog result", hr); - goto out; - } - hr = result->GetDisplayName(SIGDN_FILESYSPATH, &wname); - if (hr != S_OK) { - logHRESULT(L"error getting filename", hr); - goto out; - } - name = toUTF8(wname); - -out: - if (wname != NULL) - CoTaskMemFree(wname); - if (result != NULL) - result->Release(); - if (d != NULL) - d->Release(); - return name; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileOpenDialog, IID_IFileOpenDialog, - filter, initpath, - FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | 
FOS_NOVALIDATE | FOS_PATHMUSTEXIST | FOS_FILEMUSTEXIST | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileSaveDialog, IID_IFileSaveDialog, - filter, initpath, - FOS_OVERWRITEPROMPT | FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | FOS_NOVALIDATE | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -// TODO switch to TaskDialogIndirect()? - -static int msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) -{ - WCHAR *wtitle, *wdescription; - HRESULT hr; - - wtitle = toUTF16(title); - wdescription = toUTF16(description); - - int result; - hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, &result); - if (hr != S_OK) - logHRESULT(L"error showing task dialog", hr); - - uiFree(wdescription); - uiFree(wtitle); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, NULL); - enableAllWindowsExcept(parent); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, TD_ERROR_ICON); - enableAllWindowsExcept(parent); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - int result = - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON | TDCBF_CANCEL_BUTTON, TD_WARNING_ICON); - enableAllWindowsExcept(parent); - - return result == IDOK; -} \ No newline at end of file diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp deleted file mode 100644 index 0066668..0000000 --- a/src/libui_sdl/main.cpp +++ /dev/null @@ -1,3061 +0,0 @@ -/* - Copyright 2016-2020 Arisotura - - This file is part of melonDS. - - melonDS is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation, either version 3 of the License, or (at your option) - any later version. - - melonDS is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with melonDS. If not, see http://www.gnu.org/licenses/. 
-*/ - -#include -#include -#include -#include - -#ifndef __WIN32__ -#include -#endif - -#include -#include "libui/ui.h" - -#include "../OpenGLSupport.h" -#include "main_shaders.h" - -#include "../types.h" -#include "../version.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" -#include "DlgInputConfig.h" -#include "DlgVideoSettings.h" -#include "DlgAudioSettings.h" -#include "DlgWifiSettings.h" - -#include "../NDS.h" -#include "../GBACart.h" -#include "../GPU.h" -#include "../SPU.h" -#include "../Wifi.h" -#include "../Platform.h" -#include "../Config.h" -#include "../ARMJIT.h" - -#include "../Savestate.h" - -#include "OSD.h" - -#ifdef MELONCAP -#include "MelonCap.h" -#endif // MELONCAP - - -// savestate slot mapping -// 1-8: regular slots (quick access) -// '9': load/save arbitrary file -const int kSavestateNum[9] = {1, 2, 3, 4, 5, 6, 7, 8, 0}; - -const int kScreenSize[4] = {1, 2, 3, 4}; -const int kScreenRot[4] = {0, 1, 2, 3}; -const int kScreenGap[6] = {0, 1, 8, 64, 90, 128}; -const int kScreenLayout[3] = {0, 1, 2}; -const int kScreenSizing[4] = {0, 1, 2, 3}; - - -char* EmuDirectory; - - -uiWindow* MainWindow; -uiArea* MainDrawArea; -uiAreaHandler MainDrawAreaHandler; - -const u32 kGLVersions[] = {uiGLVersion(3,2), uiGLVersion(3,1), 0}; -uiGLContext* GLContext; - -int WindowWidth, WindowHeight; - -uiMenuItem* MenuItem_SaveState; -uiMenuItem* MenuItem_LoadState; -uiMenuItem* MenuItem_UndoStateLoad; - -uiMenuItem* MenuItem_SaveStateSlot[9]; -uiMenuItem* MenuItem_LoadStateSlot[9]; - -uiMenuItem* MenuItem_Pause; -uiMenuItem* MenuItem_Reset; -uiMenuItem* MenuItem_Stop; - -uiMenuItem* MenuItem_SavestateSRAMReloc; - -uiMenuItem* MenuItem_ScreenRot[4]; -uiMenuItem* MenuItem_ScreenGap[6]; -uiMenuItem* MenuItem_ScreenLayout[3]; -uiMenuItem* MenuItem_ScreenSizing[4]; - -uiMenuItem* MenuItem_ScreenFilter; -uiMenuItem* MenuItem_LimitFPS; -uiMenuItem* MenuItem_AudioSync; -uiMenuItem* MenuItem_ShowOSD; - -SDL_Thread* EmuThread; -int EmuRunning; -volatile int EmuStatus; - -bool RunningSomething; -char ROMPath[2][1024]; -char SRAMPath[2][1024]; -char PrevSRAMPath[2][1024]; // for savestate 'undo load' - -bool SavestateLoaded; - -bool Screen_UseGL; - -bool ScreenDrawInited = false; -uiDrawBitmap* ScreenBitmap[2] = {NULL,NULL}; - -GLuint GL_ScreenShader[3]; -GLuint GL_ScreenShaderAccel[3]; -GLuint GL_ScreenShaderOSD[3]; -struct -{ - float uScreenSize[2]; - u32 u3DScale; - u32 uFilterMode; - -} GL_ShaderConfig; -GLuint GL_ShaderConfigUBO; -GLuint GL_ScreenVertexArrayID, GL_ScreenVertexBufferID; -float GL_ScreenVertices[2 * 3*2 * 4]; // position/texcoord -GLuint GL_ScreenTexture; -bool GL_ScreenSizeDirty; - -int GL_3DScale; - -bool GL_VSyncStatus; - -int ScreenGap = 0; -int ScreenLayout = 0; -int ScreenSizing = 0; -int ScreenRotation = 0; - -int MainScreenPos[3]; -int AutoScreenSizing; - -uiRect TopScreenRect; -uiRect BottomScreenRect; -uiDrawMatrix TopScreenTrans; -uiDrawMatrix BottomScreenTrans; - -bool Touching = false; - -u32 KeyInputMask, JoyInputMask; -u32 KeyHotkeyMask, JoyHotkeyMask; -u32 HotkeyMask, LastHotkeyMask; -u32 HotkeyPress, HotkeyRelease; - -#define HotkeyDown(hk) (HotkeyMask & (1<<(hk))) -#define HotkeyPressed(hk) (HotkeyPress & (1<<(hk))) -#define HotkeyReleased(hk) (HotkeyRelease & (1<<(hk))) - -bool LidStatus; - -int JoystickID; -SDL_Joystick* Joystick; - -int AudioFreq; -float AudioSampleFrac; -SDL_AudioDeviceID AudioDevice, MicDevice; - -SDL_cond* AudioSync; -SDL_mutex* AudioSyncLock; - -u32 MicBufferLength = 2048; -s16 MicBuffer[2048]; -u32 MicBufferReadPos, 
MicBufferWritePos; - -u32 MicWavLength; -s16* MicWavBuffer; - -void SetupScreenRects(int width, int height); - -void TogglePause(void* blarg); -void Reset(void* blarg); - -void SetupSRAMPath(int slot); - -void SaveState(int slot); -void LoadState(int slot); -void UndoStateLoad(); -void GetSavestateName(int slot, char* filename, int len); - -void CreateMainWindow(bool opengl); -void DestroyMainWindow(); -void RecreateMainWindow(bool opengl); - - - -bool GLScreen_InitShader(GLuint* shader, const char* fs) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS, fs, shader, "ScreenShader")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindAttribLocation(shader[2], 1, "vTexcoord"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "ScreenTex"); - glUniform1i(uni_id, 0); - uni_id = glGetUniformLocation(shader[2], "_3DTex"); - glUniform1i(uni_id, 1); - - return true; -} - -bool GLScreen_InitOSDShader(GLuint* shader) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, shader, "ScreenShaderOSD")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "OSDTex"); - glUniform1i(uni_id, 0); - - return true; -} - -bool GLScreen_Init() -{ - GL_VSyncStatus = Config::ScreenVSync; - - // TODO: consider using epoxy? 
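// note (for context): both InitShader helpers above bind their "uConfig"
// uniform block to binding point 16, and GLScreen_Init below creates
// GL_ShaderConfigUBO and attaches it to that same point via glBindBufferBase;
// that shared binding is how uScreenSize/u3DScale/uFilterMode reach every
// screen shader without per-program uniform updates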
- if (!OpenGL_Init()) - return false; - - const GLubyte* renderer = glGetString(GL_RENDERER); // get renderer string - const GLubyte* version = glGetString(GL_VERSION); // version as a string - printf("OpenGL: renderer: %s\n", renderer); - printf("OpenGL: version: %s\n", version); - - if (!GLScreen_InitShader(GL_ScreenShader, kScreenFS)) - return false; - if (!GLScreen_InitShader(GL_ScreenShaderAccel, kScreenFS_Accel)) - return false; - if (!GLScreen_InitOSDShader(GL_ScreenShaderOSD)) - return false; - - memset(&GL_ShaderConfig, 0, sizeof(GL_ShaderConfig)); - - glGenBuffers(1, &GL_ShaderConfigUBO); - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - glBufferData(GL_UNIFORM_BUFFER, sizeof(GL_ShaderConfig), &GL_ShaderConfig, GL_STATIC_DRAW); - glBindBufferBase(GL_UNIFORM_BUFFER, 16, GL_ShaderConfigUBO); - - glGenBuffers(1, &GL_ScreenVertexBufferID); - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferData(GL_ARRAY_BUFFER, sizeof(GL_ScreenVertices), NULL, GL_STATIC_DRAW); - - glGenVertexArrays(1, &GL_ScreenVertexArrayID); - glBindVertexArray(GL_ScreenVertexArrayID); - glEnableVertexAttribArray(0); // position - glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(0)); - glEnableVertexAttribArray(1); // texcoord - glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(2*4)); - - glGenTextures(1, &GL_ScreenTexture); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI, 256*3 + 1, 192*2, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, NULL); - - GL_ScreenSizeDirty = true; - - return true; -} - -void GLScreen_DeInit() -{ - glDeleteTextures(1, &GL_ScreenTexture); - - glDeleteVertexArrays(1, &GL_ScreenVertexArrayID); - glDeleteBuffers(1, &GL_ScreenVertexBufferID); - - OpenGL_DeleteShaderProgram(GL_ScreenShader); - OpenGL_DeleteShaderProgram(GL_ScreenShaderAccel); - OpenGL_DeleteShaderProgram(GL_ScreenShaderOSD); -} - -void GLScreen_DrawScreen() -{ - bool vsync = Config::ScreenVSync && !HotkeyDown(HK_FastForward); - if (vsync != GL_VSyncStatus) - { - GL_VSyncStatus = vsync; - uiGLSetVSync(vsync); - } - - float scale = uiGLGetFramebufferScale(GLContext); - - glBindFramebuffer(GL_FRAMEBUFFER, uiGLGetFramebuffer(GLContext)); - - if (GL_ScreenSizeDirty) - { - GL_ScreenSizeDirty = false; - - GL_ShaderConfig.uScreenSize[0] = WindowWidth; - GL_ShaderConfig.uScreenSize[1] = WindowHeight; - GL_ShaderConfig.u3DScale = GL_3DScale; - - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - void* unibuf = glMapBuffer(GL_UNIFORM_BUFFER, GL_WRITE_ONLY); - if (unibuf) memcpy(unibuf, &GL_ShaderConfig, sizeof(GL_ShaderConfig)); - glUnmapBuffer(GL_UNIFORM_BUFFER); - - float scwidth, scheight; - - float x0, y0, x1, y1; - float s0, s1, s2, s3; - float t0, t1, t2, t3; - -#define SETVERTEX(i, x, y, s, t) \ - GL_ScreenVertices[4*(i) + 0] = x; \ - GL_ScreenVertices[4*(i) + 1] = y; \ - GL_ScreenVertices[4*(i) + 2] = s; \ - GL_ScreenVertices[4*(i) + 3] = t; - - x0 = TopScreenRect.X; - y0 = TopScreenRect.Y; - x1 = TopScreenRect.X + TopScreenRect.Width; - y1 = TopScreenRect.Y + TopScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 0; - s1 = scwidth; t1 = 0; - s2 = 0; t2 = scheight; - s3 = 
scwidth; t3 = scheight; - break; - - case 1: - s0 = 0; t0 = scheight; - s1 = 0; t1 = 0; - s2 = scwidth; t2 = scheight; - s3 = scwidth; t3 = 0; - break; - - case 2: - s0 = scwidth; t0 = scheight; - s1 = 0; t1 = scheight; - s2 = scwidth; t2 = 0; - s3 = 0; t3 = 0; - break; - - case 3: - s0 = scwidth; t0 = 0; - s1 = scwidth; t1 = scheight; - s2 = 0; t2 = 0; - s3 = 0; t3 = scheight; - break; - } - - SETVERTEX(0, x0, y0, s0, t0); - SETVERTEX(1, x1, y1, s3, t3); - SETVERTEX(2, x1, y0, s1, t1); - SETVERTEX(3, x0, y0, s0, t0); - SETVERTEX(4, x0, y1, s2, t2); - SETVERTEX(5, x1, y1, s3, t3); - - x0 = BottomScreenRect.X; - y0 = BottomScreenRect.Y; - x1 = BottomScreenRect.X + BottomScreenRect.Width; - y1 = BottomScreenRect.Y + BottomScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 192; - s1 = scwidth; t1 = 192; - s2 = 0; t2 = 192+scheight; - s3 = scwidth; t3 = 192+scheight; - break; - - case 1: - s0 = 0; t0 = 192+scheight; - s1 = 0; t1 = 192; - s2 = scwidth; t2 = 192+scheight; - s3 = scwidth; t3 = 192; - break; - - case 2: - s0 = scwidth; t0 = 192+scheight; - s1 = 0; t1 = 192+scheight; - s2 = scwidth; t2 = 192; - s3 = 0; t3 = 192; - break; - - case 3: - s0 = scwidth; t0 = 192; - s1 = scwidth; t1 = 192+scheight; - s2 = 0; t2 = 192; - s3 = 0; t3 = 192+scheight; - break; - } - - SETVERTEX(6, x0, y0, s0, t0); - SETVERTEX(7, x1, y1, s3, t3); - SETVERTEX(8, x1, y0, s1, t1); - SETVERTEX(9, x0, y0, s0, t0); - SETVERTEX(10, x0, y1, s2, t2); - SETVERTEX(11, x1, y1, s3, t3); - -#undef SETVERTEX - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(GL_ScreenVertices), GL_ScreenVertices); - } - - glDisable(GL_DEPTH_TEST); - glDisable(GL_STENCIL_TEST); - glDisable(GL_BLEND); - glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - - glViewport(0, 0, WindowWidth*scale, WindowHeight*scale); - - if (GPU3D::Renderer == 0) - OpenGL_UseShaderProgram(GL_ScreenShader); - else - OpenGL_UseShaderProgram(GL_ScreenShaderAccel); - - glClearColor(0, 0, 0, 1); - glClear(GL_COLOR_BUFFER_BIT); - - if (RunningSomething) - { - int frontbuf = GPU::FrontBuffer; - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - - if (GPU::Framebuffer[frontbuf][0] && GPU::Framebuffer[frontbuf][1]) - { - if (GPU3D::Renderer == 0) - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - else - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - } - - glActiveTexture(GL_TEXTURE1); - if (GPU3D::Renderer != 0) - GPU3D::GLRenderer::SetupAccelFrame(); - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBindVertexArray(GL_ScreenVertexArrayID); - glDrawArrays(GL_TRIANGLES, 0, 4*3); - } - - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Update(true, NULL); - - glFlush(); - uiGLSwapBuffers(GLContext); -} - -void MicLoadWav(char* name) -{ - SDL_AudioSpec format; - memset(&format, 0, sizeof(SDL_AudioSpec)); - - if (MicWavBuffer) delete[] MicWavBuffer; - MicWavBuffer = NULL; - MicWavLength = 0; - - u8* buf; - u32 len; - if (!SDL_LoadWAV(name, &format, &buf, &len)) - return; - - const u64 dstfreq = 44100; 
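// worked example of the nearest-neighbour resampling below, with illustrative
// numbers: a mono 22050 Hz WAV holding len = 22050 input frames gives
//   MicWavLength = 22050 * 44100 / 22050 = 44100 output samples
//   res_incr     = 22050 / 44100.0      = 0.5
// so res_pos advances one source frame for every two output samples; the
// 735-sample floor is one 60 fps frame of mic input (44100 / 60 = 735),
// matching the chunks FeedMicInput hands to NDS::MicInputFrame further down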
- - if (format.format == AUDIO_S16 || format.format == AUDIO_U16) - { - int srcinc = format.channels; - len /= (2 * srcinc); - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = ((u16*)buf)[res_pos]; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else if (format.format == AUDIO_S8 || format.format == AUDIO_U8) - { - int srcinc = format.channels; - len /= srcinc; - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = buf[res_pos] << 8; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else - printf("bad WAV format %08X\n", format.format); - - SDL_FreeWAV(buf); -} - -void AudioCallback(void* data, Uint8* stream, int len) -{ - len /= (sizeof(s16) * 2); - - // resample incoming audio to match the output sample rate - - float f_len_in = (len * 32823.6328125) / (float)AudioFreq; - f_len_in += AudioSampleFrac; - int len_in = (int)floor(f_len_in); - AudioSampleFrac = f_len_in - len_in; - - s16 buf_in[1024*2]; - s16* buf_out = (s16*)stream; - - int num_in; - int num_out = len; - - SDL_LockMutex(AudioSyncLock); - num_in = SPU::ReadOutput(buf_in, len_in); - SDL_CondSignal(AudioSync); - SDL_UnlockMutex(AudioSyncLock); - - if (num_in < 1) - { - memset(stream, 0, len*sizeof(s16)*2); - return; - } - - int margin = 6; - if (num_in < len_in-margin) - { - int last = num_in-1; - if (last < 0) last = 0; - - for (int i = num_in; i < len_in-margin; i++) - ((u32*)buf_in)[i] = ((u32*)buf_in)[last]; - - num_in = len_in-margin; - } - - float res_incr = num_in / (float)num_out; - float res_timer = 0; - int res_pos = 0; - - int volume = Config::AudioVolume; - - for (int i = 0; i < len; i++) - { - buf_out[i*2 ] = (buf_in[res_pos*2 ] * volume) >> 8; - buf_out[i*2+1] = (buf_in[res_pos*2+1] * volume) >> 8; - - /*s16 s_l = buf_in[res_pos*2 ]; - s16 s_r = buf_in[res_pos*2+1]; - - float a = res_timer; - float b = 1.0 - a; - s_l = (s_l * a) + (buf_in[(res_pos-1)*2 ] * b); - s_r = (s_r * a) + (buf_in[(res_pos-1)*2+1] * b); - - buf_out[i*2 ] = (s_l * volume) >> 8; - buf_out[i*2+1] = (s_r * volume) >> 8;*/ - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos++; - } - } -} - -void MicCallback(void* data, Uint8* stream, int len) -{ - if (Config::MicInputType != 1) return; - - s16* input = (s16*)stream; - len /= sizeof(s16); - - if ((MicBufferWritePos + len) > MicBufferLength) - { - u32 len1 = MicBufferLength - MicBufferWritePos; - memcpy(&MicBuffer[MicBufferWritePos], &input[0], len1*sizeof(s16)); - memcpy(&MicBuffer[0], &input[len1], (len - len1)*sizeof(s16)); - MicBufferWritePos = len - len1; - } - else - { - memcpy(&MicBuffer[MicBufferWritePos], input, len*sizeof(s16)); - MicBufferWritePos += len; - } -} - -void FeedMicInput() -{ - int type = Config::MicInputType; - bool cmd = HotkeyDown(HK_Mic); - - if ((type != 1 && !cmd) || - (type == 1 && 
MicBufferLength == 0) || - (type == 3 && MicWavBuffer == NULL)) - { - type = 0; - MicBufferReadPos = 0; - } - - switch (type) - { - case 0: // no mic - NDS::MicInputFrame(NULL, 0); - break; - - case 1: // host mic - if ((MicBufferReadPos + 735) > MicBufferLength) - { - s16 tmp[735]; - u32 len1 = MicBufferLength - MicBufferReadPos; - memcpy(&tmp[0], &MicBuffer[MicBufferReadPos], len1*sizeof(s16)); - memcpy(&tmp[len1], &MicBuffer[0], (735 - len1)*sizeof(s16)); - - NDS::MicInputFrame(tmp, 735); - MicBufferReadPos = 735 - len1; - } - else - { - NDS::MicInputFrame(&MicBuffer[MicBufferReadPos], 735); - MicBufferReadPos += 735; - } - break; - - case 2: // white noise - { - s16 tmp[735]; - for (int i = 0; i < 735; i++) tmp[i] = rand() & 0xFFFF; - NDS::MicInputFrame(tmp, 735); - } - break; - - case 3: // WAV - if ((MicBufferReadPos + 735) > MicWavLength) - { - s16 tmp[735]; - u32 len1 = MicWavLength - MicBufferReadPos; - memcpy(&tmp[0], &MicWavBuffer[MicBufferReadPos], len1*sizeof(s16)); - memcpy(&tmp[len1], &MicWavBuffer[0], (735 - len1)*sizeof(s16)); - - NDS::MicInputFrame(tmp, 735); - MicBufferReadPos = 735 - len1; - } - else - { - NDS::MicInputFrame(&MicWavBuffer[MicBufferReadPos], 735); - MicBufferReadPos += 735; - } - break; - } -} - -void OpenJoystick() -{ - if (Joystick) SDL_JoystickClose(Joystick); - - int num = SDL_NumJoysticks(); - if (num < 1) - { - Joystick = NULL; - return; - } - - if (JoystickID >= num) - JoystickID = 0; - - Joystick = SDL_JoystickOpen(JoystickID); -} - -bool JoystickButtonDown(int val) -{ - if (val == -1) return false; - - bool hasbtn = ((val & 0xFFFF) != 0xFFFF); - - if (hasbtn) - { - if (val & 0x100) - { - int hatnum = (val >> 4) & 0xF; - int hatdir = val & 0xF; - Uint8 hatval = SDL_JoystickGetHat(Joystick, hatnum); - - bool pressed = false; - if (hatdir == 0x1) pressed = (hatval & SDL_HAT_UP); - else if (hatdir == 0x4) pressed = (hatval & SDL_HAT_DOWN); - else if (hatdir == 0x2) pressed = (hatval & SDL_HAT_RIGHT); - else if (hatdir == 0x8) pressed = (hatval & SDL_HAT_LEFT); - - if (pressed) return true; - } - else - { - int btnnum = val & 0xFFFF; - Uint8 btnval = SDL_JoystickGetButton(Joystick, btnnum); - - if (btnval) return true; - } - } - - if (val & 0x10000) - { - int axisnum = (val >> 24) & 0xF; - int axisdir = (val >> 20) & 0xF; - Sint16 axisval = SDL_JoystickGetAxis(Joystick, axisnum); - - switch (axisdir) - { - case 0: // positive - if (axisval > 16384) return true; - break; - - case 1: // negative - if (axisval < -16384) return true; - break; - - case 2: // trigger - if (axisval > 0) return true; - break; - } - } - - return false; -} - -void ProcessInput() -{ - SDL_JoystickUpdate(); - - if (Joystick) - { - if (!SDL_JoystickGetAttached(Joystick)) - { - SDL_JoystickClose(Joystick); - Joystick = NULL; - } - } - if (!Joystick && (SDL_NumJoysticks() > 0)) - { - JoystickID = Config::JoystickID; - OpenJoystick(); - } - - JoyInputMask = 0xFFF; - for (int i = 0; i < 12; i++) - if (JoystickButtonDown(Config::JoyMapping[i])) - JoyInputMask &= ~(1<> 4); - - bool pressed = false; - if (btnid == 0x101) // up - pressed = (hat & SDL_HAT_UP); - else if (btnid == 0x104) // down - pressed = (hat & SDL_HAT_DOWN); - else if (btnid == 0x102) // right - pressed = (hat & SDL_HAT_RIGHT); - else if (btnid == 0x108) // left - pressed = (hat & SDL_HAT_LEFT); - else if (btnid < njoybuttons) - pressed = (joybuttons[btnid] & ~(joybuttons[btnid] >> 1)) & 0x01; - - return pressed; -} - -bool JoyButtonHeld(int btnid, int njoybuttons, Uint8* joybuttons, Uint32 hat) -{ - if (btnid < 0) 
return false; - - bool pressed = false; - if (btnid == 0x101) // up - pressed = (hat & SDL_HAT_UP); - else if (btnid == 0x104) // down - pressed = (hat & SDL_HAT_DOWN); - else if (btnid == 0x102) // right - pressed = (hat & SDL_HAT_RIGHT); - else if (btnid == 0x108) // left - pressed = (hat & SDL_HAT_LEFT); - else if (btnid < njoybuttons) - pressed = joybuttons[btnid] & 0x01; - - return pressed; -} - -void UpdateWindowTitle(void* data) -{ - if (EmuStatus == 0) return; - void** dataarray = (void**)data; - SDL_LockMutex((SDL_mutex*)dataarray[1]); - uiWindowSetTitle(MainWindow, (const char*)dataarray[0]); - SDL_UnlockMutex((SDL_mutex*)dataarray[1]); -} - -void UpdateFPSLimit(void* data) -{ - uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1); -} - -int EmuThreadFunc(void* burp) -{ - NDS::Init(); - - MainScreenPos[0] = 0; - MainScreenPos[1] = 0; - MainScreenPos[2] = 0; - AutoScreenSizing = 0; - - if (Screen_UseGL) - { - uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(true); - uiGLMakeContextCurrent(NULL); - } - else - { - GPU3D::InitRenderer(false); - } - - Touching = false; - KeyInputMask = 0xFFF; - JoyInputMask = 0xFFF; - KeyHotkeyMask = 0; - JoyHotkeyMask = 0; - HotkeyMask = 0; - LastHotkeyMask = 0; - LidStatus = false; - - u32 nframes = 0; - u32 starttick = SDL_GetTicks(); - u32 lasttick = starttick; - u32 lastmeasuretick = lasttick; - u32 fpslimitcount = 0; - u64 perfcount = SDL_GetPerformanceCounter(); - u64 perffreq = SDL_GetPerformanceFrequency(); - float samplesleft = 0; - u32 nsamples = 0; - - char melontitle[100]; - SDL_mutex* titlemutex = SDL_CreateMutex(); - void* titledata[2] = {melontitle, titlemutex}; - - while (EmuRunning != 0) - { - ProcessInput(); - - if (HotkeyPressed(HK_FastForwardToggle)) - { - Config::LimitFPS = !Config::LimitFPS; - uiQueueMain(UpdateFPSLimit, NULL); - } - // TODO: similar hotkeys for video/audio sync? - - if (HotkeyPressed(HK_Pause)) uiQueueMain(TogglePause, NULL); - if (HotkeyPressed(HK_Reset)) uiQueueMain(Reset, NULL); - - if (GBACart::CartInserted && GBACart::HasSolarSensor) - { - if (HotkeyPressed(HK_SolarSensorDecrease)) - { - if (GBACart_SolarSensor::LightLevel > 0) GBACart_SolarSensor::LightLevel--; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - if (HotkeyPressed(HK_SolarSensorIncrease)) - { - if (GBACart_SolarSensor::LightLevel < 10) GBACart_SolarSensor::LightLevel++; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - } - - if (EmuRunning == 1) - { - EmuStatus = 1; - - // process input and hotkeys - NDS::SetKeyMask(KeyInputMask & JoyInputMask); - - if (HotkeyPressed(HK_Lid)) - { - LidStatus = !LidStatus; - NDS::SetLidClosed(LidStatus); - OSD::AddMessage(0, LidStatus ? "Lid closed" : "Lid opened"); - } - - // microphone input - FeedMicInput(); - - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - } - - // auto screen layout - { - MainScreenPos[2] = MainScreenPos[1]; - MainScreenPos[1] = MainScreenPos[0]; - MainScreenPos[0] = NDS::PowerControl9 >> 15; - - int guess; - if (MainScreenPos[0] == MainScreenPos[2] && - MainScreenPos[0] != MainScreenPos[1]) - { - // constant flickering, likely displaying 3D on both screens - // TODO: when both screens are used for 2D only...??? 
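// e.g. MainScreenPos = {1, 0, 1} (newest first): bit 15 of PowerControl9
// flipped on consecutive frames, meaning the main engine alternates between
// the two screens; both are probably showing 3D, so fall through to
// guess = 0 and size them evenly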
- guess = 0; - } - else - { - if (MainScreenPos[0] == 1) - guess = 1; - else - guess = 2; - } - - if (guess != AutoScreenSizing) - { - AutoScreenSizing = guess; - SetupScreenRects(WindowWidth, WindowHeight); - } - } - - // emulate - u32 nlines = NDS::RunFrame(); - -#ifdef MELONCAP - MelonCap::Update(); -#endif // MELONCAP - - if (EmuRunning == 0) break; - - if (Screen_UseGL) - { - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - - bool fastforward = HotkeyDown(HK_FastForward); - - if (Config::AudioSync && !fastforward) - { - SDL_LockMutex(AudioSyncLock); - while (SPU::GetOutputSize() > 1024) - { - int ret = SDL_CondWaitTimeout(AudioSync, AudioSyncLock, 500); - if (ret == SDL_MUTEX_TIMEDOUT) break; - } - SDL_UnlockMutex(AudioSyncLock); - } - else - { - // ensure the audio FIFO doesn't overflow - //SPU::TrimOutput(); - } - - float framerate = (1000.0f * nlines) / (60.0f * 263.0f); - - { - u32 curtick = SDL_GetTicks(); - u32 delay = curtick - lasttick; - - bool limitfps = Config::LimitFPS && !fastforward; - if (limitfps) - { - float wantedtickF = starttick + (framerate * (fpslimitcount+1)); - u32 wantedtick = (u32)ceil(wantedtickF); - if (curtick < wantedtick) SDL_Delay(wantedtick - curtick); - - lasttick = SDL_GetTicks(); - fpslimitcount++; - if ((abs(wantedtickF - (float)wantedtick) < 0.001312) || (fpslimitcount > 60)) - { - fpslimitcount = 0; - nsamples = 0; - starttick = lasttick; - } - } - else - { - if (delay < 1) SDL_Delay(1); - lasttick = SDL_GetTicks(); - } - } - - nframes++; - if (nframes >= 30) - { - u32 tick = SDL_GetTicks(); - u32 diff = tick - lastmeasuretick; - lastmeasuretick = tick; - - u32 fps; - if (diff < 1) fps = 77777; - else fps = (nframes * 1000) / diff; - nframes = 0; - - float fpstarget; - if (framerate < 1) fpstarget = 999; - else fpstarget = 1000.0f/framerate; - - SDL_LockMutex(titlemutex); - sprintf(melontitle, "[%d/%.0f] melonDS " MELONDS_VERSION, fps, fpstarget); - SDL_UnlockMutex(titlemutex); - uiQueueMain(UpdateWindowTitle, titledata); - } - } - else - { - // paused - nframes = 0; - lasttick = SDL_GetTicks(); - starttick = lasttick; - lastmeasuretick = lasttick; - fpslimitcount = 0; - - if (EmuRunning == 2) - { - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - } - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - EmuStatus = EmuRunning; - - SDL_Delay(100); - } - } - - EmuStatus = 0; - - SDL_DestroyMutex(titlemutex); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - - NDS::DeInit(); - Platform::LAN_DeInit(); - - if (Screen_UseGL) - { - OSD::DeInit(true); - GLScreen_DeInit(); - } - else - OSD::DeInit(false); - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - return 44203; -} - -void StopEmuThread() -{ - EmuRunning = 0; - SDL_WaitThread(EmuThread, NULL); -} - - -void OnAreaDraw(uiAreaHandler* handler, uiArea* area, uiAreaDrawParams* params) -{ - if (!ScreenDrawInited) - { - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenDrawInited = true; - ScreenBitmap[0] = uiDrawNewBitmap(params->Context, 256, 192, 0); - ScreenBitmap[1] = uiDrawNewBitmap(params->Context, 256, 192, 0); - } - - int frontbuf = GPU::FrontBuffer; - if (!ScreenBitmap[0] || !ScreenBitmap[1]) return; - if (!GPU::Framebuffer[frontbuf][0] || !GPU::Framebuffer[frontbuf][1]) return; - - uiRect top = {0, 0, 256, 192}; - uiRect bot = {0, 0, 256, 192}; 
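// note: top/bot above are full 256x192 source rects; uiDrawBitmapDraw below
// scales them into TopScreenRect/BottomScreenRect under the per-screen
// transforms, with the Config::ScreenFilter==1 argument requesting filtered
// (smooth) rather than pixelated scaling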
- - uiDrawBitmapUpdate(ScreenBitmap[0], GPU::Framebuffer[frontbuf][0]); - uiDrawBitmapUpdate(ScreenBitmap[1], GPU::Framebuffer[frontbuf][1]); - - uiDrawSave(params->Context); - uiDrawTransform(params->Context, &TopScreenTrans); - uiDrawBitmapDraw(params->Context, ScreenBitmap[0], &top, &TopScreenRect, Config::ScreenFilter==1); - uiDrawRestore(params->Context); - - uiDrawSave(params->Context); - uiDrawTransform(params->Context, &BottomScreenTrans); - uiDrawBitmapDraw(params->Context, ScreenBitmap[1], &bot, &BottomScreenRect, Config::ScreenFilter==1); - uiDrawRestore(params->Context); - - OSD::Update(false, params); -} - -void OnAreaMouseEvent(uiAreaHandler* handler, uiArea* area, uiAreaMouseEvent* evt) -{ - int x = (int)evt->X; - int y = (int)evt->Y; - - if (Touching && (evt->Up == 1)) - { - Touching = false; - NDS::ReleaseKey(16+6); - NDS::ReleaseScreen(); - } - else if (!Touching && (evt->Down == 1) && - (x >= BottomScreenRect.X) && (y >= BottomScreenRect.Y) && - (x < (BottomScreenRect.X+BottomScreenRect.Width)) && (y < (BottomScreenRect.Y+BottomScreenRect.Height))) - { - Touching = true; - NDS::PressKey(16+6); - } - - if (Touching) - { - x -= BottomScreenRect.X; - y -= BottomScreenRect.Y; - - if (ScreenRotation == 0 || ScreenRotation == 2) - { - if (BottomScreenRect.Width != 256) - x = (x * 256) / BottomScreenRect.Width; - if (BottomScreenRect.Height != 192) - y = (y * 192) / BottomScreenRect.Height; - - if (ScreenRotation == 2) - { - x = 255 - x; - y = 191 - y; - } - } - else - { - if (BottomScreenRect.Width != 192) - x = (x * 192) / BottomScreenRect.Width; - if (BottomScreenRect.Height != 256) - y = (y * 256) / BottomScreenRect.Height; - - if (ScreenRotation == 1) - { - int tmp = x; - x = y; - y = 191 - tmp; - } - else - { - int tmp = x; - x = 255 - y; - y = tmp; - } - } - - // clamp - if (x < 0) x = 0; - else if (x > 255) x = 255; - if (y < 0) y = 0; - else if (y > 191) y = 191; - - // TODO: take advantage of possible extra precision when possible? (scaled window for example) - NDS::TouchScreen(x, y); - } -} - -void OnAreaMouseCrossed(uiAreaHandler* handler, uiArea* area, int left) -{ -} - -void OnAreaDragBroken(uiAreaHandler* handler, uiArea* area) -{ -} - -bool EventMatchesKey(uiAreaKeyEvent* evt, int val, bool checkmod) -{ - if (val == -1) return false; - - int key = val & 0xFFFF; - int mod = val >> 16; - return evt->Scancode == key && (!checkmod || evt->Modifiers == mod); -} - -int OnAreaKeyEvent(uiAreaHandler* handler, uiArea* area, uiAreaKeyEvent* evt) -{ - // TODO: release all keys if the window loses focus? or somehow global key input? - if (evt->Scancode == 0x38) // ALT - return 0; - if (evt->Modifiers == 0x2) // ALT+key - return 0; - - if (evt->Up) - { - for (int i = 0; i < 12; i++) - if (EventMatchesKey(evt, Config::KeyMapping[i], false)) - KeyInputMask |= (1<Repeat) - { - // TODO, eventually: make savestate keys configurable? - // F keys: 3B-44, 57-58 | SHIFT: mod. 
0x4 - if (evt->Scancode >= 0x3B && evt->Scancode <= 0x42) // F1-F8, quick savestate - { - if (evt->Modifiers == 0x4) SaveState(1 + (evt->Scancode - 0x3B)); - else if (evt->Modifiers == 0x0) LoadState(1 + (evt->Scancode - 0x3B)); - } - else if (evt->Scancode == 0x43) // F9, savestate from/to file - { - if (evt->Modifiers == 0x4) SaveState(0); - else if (evt->Modifiers == 0x0) LoadState(0); - } - else if (evt->Scancode == 0x58) // F12, undo savestate - { - if (evt->Modifiers == 0x0) UndoStateLoad(); - } - - for (int i = 0; i < 12; i++) - if (EventMatchesKey(evt, Config::KeyMapping[i], false)) - KeyInputMask &= ~(1<Scancode == 0x57) // F11 - // NDS::debug(0); - } - - return 1; -} - -void SetupScreenRects(int width, int height) -{ - bool horizontal = false; - bool sideways = false; - - if (ScreenRotation == 1 || ScreenRotation == 3) - sideways = true; - - if (ScreenLayout == 2) horizontal = true; - else if (ScreenLayout == 0) - { - if (sideways) - horizontal = true; - } - - int sizemode; - if (ScreenSizing == 3) - sizemode = AutoScreenSizing; - else - sizemode = ScreenSizing; - - int screenW, screenH, gap; - if (sideways) - { - screenW = 192; - screenH = 256; - } - else - { - screenW = 256; - screenH = 192; - } - - gap = ScreenGap; - - uiRect *topscreen, *bottomscreen; - if (ScreenRotation == 1 || ScreenRotation == 2) - { - topscreen = &BottomScreenRect; - bottomscreen = &TopScreenRect; - } - else - { - topscreen = &TopScreenRect; - bottomscreen = &BottomScreenRect; - } - - if (horizontal) - { - // side-by-side - - int heightreq; - int startX = 0; - - width -= gap; - - if (sizemode == 0) // even - { - heightreq = (width * screenH) / (screenW*2); - if (heightreq > height) - { - int newwidth = (height * width) / heightreq; - startX = (width - newwidth) / 2; - heightreq = height; - width = newwidth; - } - } - else // emph. top/bottom - { - heightreq = ((width - screenW) * screenH) / screenW; - if (heightreq > height) - { - int newwidth = ((height * (width - screenW)) / heightreq) + screenW; - startX = (width - newwidth) / 2; - heightreq = height; - width = newwidth; - } - } - - if (sizemode == 2) - { - topscreen->Width = screenW; - topscreen->Height = screenH; - } - else - { - topscreen->Width = (sizemode==0) ? (width / 2) : (width - screenW); - topscreen->Height = heightreq; - } - topscreen->X = startX; - topscreen->Y = ((height - heightreq) / 2) + (heightreq - topscreen->Height); - - bottomscreen->X = topscreen->X + topscreen->Width + gap; - - if (sizemode == 1) - { - bottomscreen->Width = screenW; - bottomscreen->Height = screenH; - } - else - { - bottomscreen->Width = width - topscreen->Width; - bottomscreen->Height = heightreq; - } - bottomscreen->Y = ((height - heightreq) / 2) + (heightreq - bottomscreen->Height); - } - else - { - // top then bottom - - int widthreq; - int startY = 0; - - height -= gap; - - if (sizemode == 0) // even - { - widthreq = (height * screenW) / (screenH*2); - if (widthreq > width) - { - int newheight = (width * height) / widthreq; - startY = (height - newheight) / 2; - widthreq = width; - height = newheight; - } - } - else // emph. top/bottom - { - widthreq = ((height - screenH) * screenW) / screenH; - if (widthreq > width) - { - int newheight = ((width * (height - screenH)) / widthreq) + screenH; - startY = (height - newheight) / 2; - widthreq = width; - height = newheight; - } - } - - if (sizemode == 2) - { - topscreen->Width = screenW; - topscreen->Height = screenH; - } - else - { - topscreen->Width = widthreq; - topscreen->Height = (sizemode==0) ? 
(height / 2) : (height - screenH); - } - topscreen->Y = startY; - topscreen->X = (width - topscreen->Width) / 2; - - bottomscreen->Y = topscreen->Y + topscreen->Height + gap; - - if (sizemode == 1) - { - bottomscreen->Width = screenW; - bottomscreen->Height = screenH; - } - else - { - bottomscreen->Width = widthreq; - bottomscreen->Height = height - topscreen->Height; - } - bottomscreen->X = (width - bottomscreen->Width) / 2; - } - - // setup matrices for potential rotation - - uiDrawMatrixSetIdentity(&TopScreenTrans); - uiDrawMatrixSetIdentity(&BottomScreenTrans); - - switch (ScreenRotation) - { - case 1: // 90° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y); - } - break; - - case 2: // 180° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - - case 3: // 270° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - } - - GL_ScreenSizeDirty = true; -} - -void SetMinSize(int w, int h) -{ - int cw, ch; - uiWindowContentSize(MainWindow, &cw, &ch); - - uiControlSetMinSize(uiControl(MainDrawArea), w, h); - if ((cw < w) || (ch < h)) - { - if (cw < w) cw = w; - if (ch < h) ch = h; - uiWindowSetContentSize(MainWindow, cw, ch); - } -} - -void OnAreaResize(uiAreaHandler* handler, uiArea* area, int width, int height) -{ - SetupScreenRects(width, height); - - // TODO: - // should those be the size of the uiArea, or the size of the window client area? - // for now the uiArea fills the whole window anyway - // but... 
we never know, I guess - WindowWidth = width; - WindowHeight = height; - - int ismax = uiWindowMaximized(MainWindow); - int ismin = uiWindowMinimized(MainWindow); - - Config::WindowMaximized = ismax; - if (!ismax && !ismin) - { - Config::WindowWidth = width; - Config::WindowHeight = height; - } - - OSD::WindowResized(Screen_UseGL); -} - - -void Run() -{ - EmuRunning = 1; - RunningSomething = true; - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - uiMenuItemEnable(MenuItem_SaveState); - uiMenuItemEnable(MenuItem_LoadState); - - if (SavestateLoaded) - uiMenuItemEnable(MenuItem_UndoStateLoad); - else - uiMenuItemDisable(MenuItem_UndoStateLoad); - - for (int i = 0; i < 8; i++) - { - char ssfile[1024]; - GetSavestateName(i+1, ssfile, 1024); - if (Platform::FileExists(ssfile)) uiMenuItemEnable(MenuItem_LoadStateSlot[i]); - else uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - } - - for (int i = 0; i < 9; i++) uiMenuItemEnable(MenuItem_SaveStateSlot[i]); - uiMenuItemEnable(MenuItem_LoadStateSlot[8]); - - uiMenuItemEnable(MenuItem_Pause); - uiMenuItemEnable(MenuItem_Reset); - uiMenuItemEnable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); -} - -void TogglePause(void* blarg) -{ - if (!RunningSomething) return; - - if (EmuRunning == 1) - { - // enable pause - EmuRunning = 2; - uiMenuItemSetChecked(MenuItem_Pause, 1); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0, "Paused"); - } - else - { - // disable pause - EmuRunning = 1; - uiMenuItemSetChecked(MenuItem_Pause, 0); - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - OSD::AddMessage(0, "Resumed"); - } -} - -void Reset(void* blarg) -{ - if (!RunningSomething) return; - - EmuRunning = 2; - while (EmuStatus != 2); - - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - if (ROMPath[0][0] == '\0') - NDS::LoadBIOS(); - else - { - SetupSRAMPath(0); - NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot); - } - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - - Run(); - - OSD::AddMessage(0, "Reset"); -} - -void Stop(bool internal) -{ - EmuRunning = 2; - if (!internal) // if shutting down from the UI thread, wait till the emu thread has stopped - while (EmuStatus != 2); - RunningSomething = false; - - // eject any inserted GBA cartridge - GBACart::Eject(); - ROMPath[1][0] = '\0'; - - uiWindowSetTitle(MainWindow, "melonDS " MELONDS_VERSION); - - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]); - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - uiMenuItemDisable(MenuItem_UndoStateLoad); - - uiMenuItemDisable(MenuItem_Pause); - uiMenuItemDisable(MenuItem_Reset); - uiMenuItemDisable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); - - uiAreaQueueRedrawAll(MainDrawArea); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0xFFC040, "Shutdown"); -} - -void SetupSRAMPath(int slot) -{ - strncpy(SRAMPath[slot], ROMPath[slot], 1023); - SRAMPath[slot][1023] = '\0'; - strncpy(SRAMPath[slot] + strlen(ROMPath[slot]) - 3, "sav", 3); -} - -void TryLoadROM(char* file, int slot, int prevstatus) -{ - char oldpath[1024]; - char oldsram[1024]; - strncpy(oldpath, ROMPath[slot], 1024); - strncpy(oldsram, SRAMPath[slot], 1024); - - 
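A note on the touch handling in OnAreaMouseEvent above: window coordinates inside the bottom-screen rect are rescaled to the DS's native 256x192 touchscreen and then un-rotated. A minimal standalone sketch of that mapping under the same rotation conventions (names here are illustrative, not melonDS API; the real handler also tracks the Touching state and presses the touch key):

    #include <algorithm>

    struct Rect { int X, Y, Width, Height; };

    // Map window coordinates to DS touchscreen coordinates (0..255, 0..191),
    // mirroring the rotation cases of OnAreaMouseEvent above.
    static void MapTouch(int x, int y, const Rect& r, int rotation,
                         int& outX, int& outY)
    {
        x -= r.X;
        y -= r.Y;

        if (rotation == 0 || rotation == 2)
        {
            x = (x * 256) / r.Width;   // rect is 256x192 in window space
            y = (y * 192) / r.Height;
            if (rotation == 2) { x = 255 - x; y = 191 - y; } // 180°: flip both axes
        }
        else
        {
            x = (x * 192) / r.Width;   // sideways: rect is 192x256 in window space
            y = (y * 256) / r.Height;
            if (rotation == 1) { int t = x; x = y;       y = 191 - t; } // 90°
            else               { int t = x; x = 255 - y; y = t;       } // 270°
        }

        outX = std::clamp(x, 0, 255);
        outY = std::clamp(y, 0, 191);
    }
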
strncpy(ROMPath[slot], file, 1023); - ROMPath[slot][1023] = '\0'; - - SetupSRAMPath(0); - SetupSRAMPath(1); - - if (slot == 0 && NDS::LoadROM(ROMPath[slot], SRAMPath[slot], Config::DirectBoot)) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - // Reload the inserted GBA cartridge (if any) - if (ROMPath[1][0] != '\0') NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - Run(); - } - else if (slot == 1 && NDS::LoadGBAROM(ROMPath[slot], SRAMPath[slot])) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - if (RunningSomething) Run(); // do not start just from a GBA cart - } - else - { - uiMsgBoxError(MainWindow, - "Failed to load the ROM", - "Make sure the file can be accessed and isn't opened in another application."); - - strncpy(ROMPath[slot], oldpath, 1024); - strncpy(SRAMPath[slot], oldsram, 1024); - EmuRunning = prevstatus; - } -} - - -// SAVESTATE TODO -// * configurable paths. not everyone wants their ROM directory to be polluted, I guess. - -void GetSavestateName(int slot, char* filename, int len) -{ - int pos; - - if (ROMPath[0][0] == '\0') // running firmware, no ROM - { - strcpy(filename, "firmware"); - pos = 8; - } - else - { - int l = strlen(ROMPath[0]); - pos = l; - while (ROMPath[0][pos] != '.' && pos > 0) pos--; - if (pos == 0) pos = l; - - // avoid buffer overflow. shoddy - if (pos > len-5) pos = len-5; - - strncpy(&filename[0], ROMPath[0], pos); - } - strcpy(&filename[pos], ".ml"); - filename[pos+3] = '0'+slot; - filename[pos+4] = '\0'; -} - -void LoadState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiOpenFile(MainWindow, "melonDS savestate (any)|*.ml1;*.ml2;*.ml3;*.ml4;*.ml5;*.ml6;*.ml7;*.ml8;*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - if (!Platform::FileExists(filename)) - { - char msg[64]; - if (slot > 0) sprintf(msg, "State slot %d is empty", slot); - else sprintf(msg, "State file does not exist"); - OSD::AddMessage(0xFFA0A0, msg); - - EmuRunning = prevstatus; - return; - } - - u32 oldGBACartCRC = GBACart::CartCRC; - - // backup - Savestate* backup = new Savestate("timewarp.mln", true); - NDS::DoSavestate(backup); - delete backup; - - bool failed = false; - - Savestate* state = new Savestate(filename, false); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not load savestate file."); - - // current state might be crapoed, so restore from sane backup - state = new Savestate("timewarp.mln", false); - failed = true; - } - - NDS::DoSavestate(state); - delete state; - - if (!failed) - { - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(PrevSRAMPath[0], SRAMPath[0], 1024); - - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], false); - } - - bool loadedPartialGBAROM = false; - - // in case we have a GBA cart inserted, and the GBA ROM changes - // due to having loaded a save state, we do not want to reload - // the previous cartridge on reset, or commit writes to any - // loaded save file. therefore, their paths are "nulled". 
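GetSavestateName above encodes slot N as "<ROM path minus extension>.mlN", falling back to "firmware" when no ROM is loaded. A rough self-contained sketch of the same naming scheme, using a hypothetical snprintf-based helper (the real function writes into a fixed buffer and clamps against overflow by hand):

    #include <cstdio>
    #include <cstring>

    // Sketch of the savestate naming scheme: strip the ROM extension and
    // append ".ml<slot>". Helper name and signature are illustrative.
    static void BuildSavestateName(const char* rompath, int slot,
                                   char* out, size_t outlen)
    {
        if (!rompath || !rompath[0]) // running firmware, no ROM
        {
            snprintf(out, outlen, "firmware.ml%d", slot);
            return;
        }

        const char* dot = strrchr(rompath, '.');
        int baselen = dot ? (int)(dot - rompath) : (int)strlen(rompath);
        snprintf(out, outlen, "%.*s.ml%d", baselen, rompath, slot);
    }

    // BuildSavestateName("games/mario.nds", 3, buf, sizeof(buf))
    //   -> "games/mario.ml3"
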
- if (GBACart::CartInserted && GBACart::CartCRC != oldGBACartCRC) - { - ROMPath[1][0] = '\0'; - SRAMPath[1][0] = '\0'; - loadedPartialGBAROM = true; - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State loaded from slot %d%s", - slot, loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - else sprintf(msg, "State loaded from file%s", - loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - OSD::AddMessage(0, msg); - - SavestateLoaded = true; - uiMenuItemEnable(MenuItem_UndoStateLoad); - } - - EmuRunning = prevstatus; -} - -void SaveState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiSaveFile(MainWindow, "melonDS savestate (*.mln)|*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - Savestate* state = new Savestate(filename, true); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not save state."); - } - else - { - NDS::DoSavestate(state); - delete state; - - if (slot > 0) - uiMenuItemEnable(MenuItem_LoadStateSlot[slot-1]); - - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], true); - } - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State saved to slot %d", slot); - else sprintf(msg, "State saved to file"); - OSD::AddMessage(0, msg); - - EmuRunning = prevstatus; -} - -void UndoStateLoad() -{ - if (!SavestateLoaded) return; - - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - // pray that this works - // what do we do if it doesn't??? - // but it should work. - Savestate* backup = new Savestate("timewarp.mln", false); - NDS::DoSavestate(backup); - delete backup; - - if (ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], PrevSRAMPath[0], 1024); - NDS::RelocateSave(SRAMPath[0], false); - } - - OSD::AddMessage(0, "State load undone"); - - EmuRunning = prevstatus; -} - - -void CloseAllDialogs() -{ - DlgAudioSettings::Close(); - DlgEmuSettings::Close(); - DlgInputConfig::Close(0); - DlgInputConfig::Close(1); - DlgVideoSettings::Close(); - DlgWifiSettings::Close(); -} - - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - uiQuit(); - return 1; -} - -void OnDropFile(uiWindow* window, char* file, void* blarg) -{ - char* ext = &file[strlen(file)-3]; - int prevstatus = EmuRunning; - - if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl")) - { - if (RunningSomething) - { - EmuRunning = 2; - while (EmuStatus != 2); - } - - TryLoadROM(file, 0, prevstatus); - } - else if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } -} - -void OnGetFocus(uiWindow* window, void* blarg) -{ - uiControlSetFocus(uiControl(MainDrawArea)); -} - -void OnLoseFocus(uiWindow* window, void* blarg) -{ - // TODO: shit here? 
-} - -void OnCloseByMenu(uiMenuItem* item, uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - DestroyMainWindow(); - uiQuit(); -} - -void OnOpenFile(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char* file = uiOpenFile(window, "DS ROM (*.nds)|*.nds;*.srl|GBA ROM (*.gba)|*.gba|Any file|*.*", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - int pos = strlen(file)-1; - while (file[pos] != '/' && file[pos] != '\\' && pos > 0) pos--; - strncpy(Config::LastROMFolder, file, pos); - Config::LastROMFolder[pos] = '\0'; - char* ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } - else - { - TryLoadROM(file, 0, prevstatus); - } - - uiFreeText(file); -} - -void OnSaveState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - SaveState(slot); -} - -void OnLoadState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - LoadState(slot); -} - -void OnUndoStateLoad(uiMenuItem* item, uiWindow* window, void* param) -{ - UndoStateLoad(); -} - -void OnRun(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) - { - ROMPath[0][0] = '\0'; - NDS::LoadBIOS(); - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - } - - Run(); -} - -void OnPause(uiMenuItem* item, uiWindow* window, void* blarg) -{ - TogglePause(NULL); -} - -void OnReset(uiMenuItem* item, uiWindow* window, void* blarg) -{ - Reset(NULL); -} - -void OnStop(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) return; - - Stop(false); -} - -void OnOpenEmuSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgEmuSettings::Open(); -} - -void OnOpenInputConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(0); -} - -void OnOpenHotkeyConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(1); -} - -void OnOpenVideoSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgVideoSettings::Open(); -} - -void OnOpenAudioSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgAudioSettings::Open(); -} - -void OnOpenWifiSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgWifiSettings::Open(); -} - - -void OnSetSavestateSRAMReloc(uiMenuItem* item, uiWindow* window, void* param) -{ - Config::SavestateRelocSRAM = uiMenuItemChecked(item) ? 
1:0; -} - - -void EnsureProperMinSize() -{ - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w0 = 256; - int h0 = 192; - int w1 = 256; - int h1 = 192; - - if (ScreenLayout == 0) // natural - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - SetMinSize(std::max(h0,h1), w0+ScreenGap+w1); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else // horizontal - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(w0+ScreenGap+w1, std::max(h0,h1)); - } -} - -void OnSetScreenSize(uiMenuItem* item, uiWindow* window, void* param) -{ - int factor = *(int*)param; - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w = 256*factor; - int h = 192*factor; - - // FIXME - - if (ScreenLayout == 0) // natural - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - uiWindowSetContentSize(window, h, (w*2)+ScreenGap); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else // horizontal - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, (w*2)+ScreenGap, h); - } -} - -void OnSetScreenRotation(uiMenuItem* item, uiWindow* window, void* param) -{ - int rot = *(int*)param; - - int oldrot = ScreenRotation; - ScreenRotation = rot; - - int w, h; - uiWindowContentSize(window, &w, &h); - - bool isHori = (rot == 1 || rot == 3); - bool wasHori = (oldrot == 1 || oldrot == 3); - - EnsureProperMinSize(); - - if (ScreenLayout == 0) // natural - { - if (isHori ^ wasHori) - { - int blarg = h; - h = w; - w = blarg; - - uiWindowSetContentSize(window, w, h); - } - } - - SetupScreenRects(w, h); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenRot[i], i==ScreenRotation); -} - -void OnSetScreenGap(uiMenuItem* item, uiWindow* window, void* param) -{ - int gap = *(int*)param; - - //int oldgap = ScreenGap; - ScreenGap = gap; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 6; i++) - uiMenuItemSetChecked(MenuItem_ScreenGap[i], kScreenGap[i]==ScreenGap); -} - -void OnSetScreenLayout(uiMenuItem* item, uiWindow* window, void* param) -{ - int layout = *(int*)param; - ScreenLayout = layout; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 3; i++) - uiMenuItemSetChecked(MenuItem_ScreenLayout[i], i==ScreenLayout); -} - -void OnSetScreenSizing(uiMenuItem* item, uiWindow* window, void* param) -{ - int sizing = *(int*)param; - ScreenSizing = sizing; - - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenSizing[i], i==ScreenSizing); -} - -void OnSetScreenFiltering(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ScreenFilter = 1; - else Config::ScreenFilter = 0; -} - -void OnSetLimitFPS(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::LimitFPS = true; - else Config::LimitFPS = false; -} - -void OnSetAudioSync(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::AudioSync = true; - else Config::AudioSync = false; -} - -void OnSetShowOSD(uiMenuItem* item, 
uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ShowOSD = true; - else Config::ShowOSD = false; -} - -void ApplyNewSettings(int type) -{ -#ifdef JIT_ENABLED - if (type == 4) - { - Reset(NULL); - return; - } -#endif - - if (!RunningSomething) - { - if (type == 1) return; - } - - int prevstatus = EmuRunning; - EmuRunning = 3; - while (EmuStatus != 3); - - if (type == 0) // 3D renderer settings - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::UpdateRendererConfig(); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - GL_3DScale = Config::GL_ScaleFactor; // dorp - GL_ScreenSizeDirty = true; - } - else if (type == 1) // wifi settings - { - if (Wifi::MPInited) - { - Platform::MP_DeInit(); - Platform::MP_Init(); - } - - Platform::LAN_DeInit(); - Platform::LAN_Init(); - } - else if (type == 2) // video output method - { - bool usegl = Config::ScreenUseGL || (Config::_3DRenderer != 0); - if (usegl != Screen_UseGL) - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - OSD::DeInit(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - Screen_UseGL = usegl; - RecreateMainWindow(usegl); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - } - else if (type == 3) // 3D renderer - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - EmuRunning = prevstatus; -} - - -void CreateMainWindowMenu() -{ - uiMenu* menu; - uiMenuItem* menuitem; - - menu = uiNewMenu("File"); - menuitem = uiMenuAppendItem(menu, "Open ROM..."); - uiMenuItemOnClicked(menuitem, OnOpenFile, NULL); - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Save state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tShift+F%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tShift+F9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnSaveState, (void*)&kSavestateNum[i]); - - MenuItem_SaveStateSlot[i] = ssitem; - } - - MenuItem_SaveState = uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Load state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tF%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tF9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnLoadState, (void*)&kSavestateNum[i]); - - MenuItem_LoadStateSlot[i] = ssitem; - } - - MenuItem_LoadState = uiMenuAppendSubmenu(menu, submenu); - } - menuitem = uiMenuAppendItem(menu, "Undo state load\tF12"); - uiMenuItemOnClicked(menuitem, OnUndoStateLoad, NULL); - MenuItem_UndoStateLoad = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Quit"); - uiMenuItemOnClicked(menuitem, OnCloseByMenu, NULL); - - menu = uiNewMenu("System"); - menuitem = uiMenuAppendItem(menu, "Run"); - uiMenuItemOnClicked(menuitem, OnRun, NULL); - menuitem = uiMenuAppendCheckItem(menu, "Pause"); - uiMenuItemOnClicked(menuitem, OnPause, NULL); - MenuItem_Pause = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Reset"); - uiMenuItemOnClicked(menuitem, OnReset, NULL); - MenuItem_Reset = menuitem; - menuitem = uiMenuAppendItem(menu, "Stop"); - uiMenuItemOnClicked(menuitem, OnStop, NULL); - 
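ApplyNewSettings above dispatches on a bare integer whose meaning lives only in the comments. A sketch of those codes as a named enum, purely for reference; the enum itself is hypothetical, the codebase passes plain ints:

    // Settings-change categories that ApplyNewSettings switches on.
    enum SettingsChangeType
    {
        SettingsChange_3DRenderConfig = 0, // 3D renderer settings (scale factor etc.)
        SettingsChange_Wifi           = 1, // wifi settings, reinit MP/LAN
        SettingsChange_VideoOutput    = 2, // video output method (GL vs software)
        SettingsChange_3DRenderer     = 3, // switch the 3D renderer itself
        SettingsChange_JIT            = 4  // JIT settings, requires a full reset
    };
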
MenuItem_Stop = menuitem; - - menu = uiNewMenu("Config"); - { - menuitem = uiMenuAppendItem(menu, "Emu settings"); - uiMenuItemOnClicked(menuitem, OnOpenEmuSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Input config"); - uiMenuItemOnClicked(menuitem, OnOpenInputConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Hotkey config"); - uiMenuItemOnClicked(menuitem, OnOpenHotkeyConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Video settings"); - uiMenuItemOnClicked(menuitem, OnOpenVideoSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Audio settings"); - uiMenuItemOnClicked(menuitem, OnOpenAudioSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Wifi settings"); - uiMenuItemOnClicked(menuitem, OnOpenWifiSettings, NULL); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Savestate settings"); - - MenuItem_SavestateSRAMReloc = uiMenuAppendCheckItem(submenu, "Separate savefiles"); - uiMenuItemOnClicked(MenuItem_SavestateSRAMReloc, OnSetSavestateSRAMReloc, NULL); - - uiMenuAppendSubmenu(menu, submenu); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Screen size"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%dx", kScreenSize[i]); - uiMenuItem* item = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(item, OnSetScreenSize, (void*)&kScreenSize[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen rotation"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%d", kScreenRot[i]*90); - MenuItem_ScreenRot[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenRot[i], OnSetScreenRotation, (void*)&kScreenRot[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Mid-screen gap"); - - //for (int i = 0; kScreenGap[i] != -1; i++) - for (int i = 0; i < 6; i++) - { - char name[32]; - sprintf(name, "%d pixels", kScreenGap[i]); - MenuItem_ScreenGap[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenGap[i], OnSetScreenGap, (void*)&kScreenGap[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen layout"); - - MenuItem_ScreenLayout[0] = uiMenuAppendCheckItem(submenu, "Natural"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[0], OnSetScreenLayout, (void*)&kScreenLayout[0]); - MenuItem_ScreenLayout[1] = uiMenuAppendCheckItem(submenu, "Vertical"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[1], OnSetScreenLayout, (void*)&kScreenLayout[1]); - MenuItem_ScreenLayout[2] = uiMenuAppendCheckItem(submenu, "Horizontal"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[2], OnSetScreenLayout, (void*)&kScreenLayout[2]); - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen sizing"); - - MenuItem_ScreenSizing[0] = uiMenuAppendCheckItem(submenu, "Even"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[0], OnSetScreenSizing, (void*)&kScreenSizing[0]); - MenuItem_ScreenSizing[1] = uiMenuAppendCheckItem(submenu, "Emphasize top"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[1], OnSetScreenSizing, (void*)&kScreenSizing[1]); - MenuItem_ScreenSizing[2] = uiMenuAppendCheckItem(submenu, "Emphasize bottom"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[2], OnSetScreenSizing, (void*)&kScreenSizing[2]); - MenuItem_ScreenSizing[3] = uiMenuAppendCheckItem(submenu, "Auto"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[3], OnSetScreenSizing, (void*)&kScreenSizing[3]); - - uiMenuAppendSubmenu(menu, submenu); - } - - MenuItem_ScreenFilter = 
uiMenuAppendCheckItem(menu, "Screen filtering"); - uiMenuItemOnClicked(MenuItem_ScreenFilter, OnSetScreenFiltering, NULL); - - MenuItem_ShowOSD = uiMenuAppendCheckItem(menu, "Show OSD"); - uiMenuItemOnClicked(MenuItem_ShowOSD, OnSetShowOSD, NULL); - - uiMenuAppendSeparator(menu); - - MenuItem_LimitFPS = uiMenuAppendCheckItem(menu, "Limit framerate"); - uiMenuItemOnClicked(MenuItem_LimitFPS, OnSetLimitFPS, NULL); - - MenuItem_AudioSync = uiMenuAppendCheckItem(menu, "Audio sync"); - uiMenuItemOnClicked(MenuItem_AudioSync, OnSetAudioSync, NULL); -} - -void CreateMainWindow(bool opengl) -{ - MainWindow = uiNewWindow("melonDS " MELONDS_VERSION, - WindowWidth, WindowHeight, - Config::WindowMaximized, 1, 1); - uiWindowOnClosing(MainWindow, OnCloseWindow, NULL); - - uiWindowSetDropTarget(MainWindow, 1); - uiWindowOnDropFile(MainWindow, OnDropFile, NULL); - - uiWindowOnGetFocus(MainWindow, OnGetFocus, NULL); - uiWindowOnLoseFocus(MainWindow, OnLoseFocus, NULL); - - ScreenDrawInited = false; - bool opengl_good = opengl; - - if (!opengl) MainDrawArea = uiNewArea(&MainDrawAreaHandler); - else MainDrawArea = uiNewGLArea(&MainDrawAreaHandler, kGLVersions); - - uiWindowSetChild(MainWindow, uiControl(MainDrawArea)); - uiControlSetMinSize(uiControl(MainDrawArea), 256, 384); - uiAreaSetBackgroundColor(MainDrawArea, 0, 0, 0); - - uiControlShow(uiControl(MainWindow)); - uiControlSetFocus(uiControl(MainDrawArea)); - - if (opengl_good) - { - GLContext = uiAreaGetGLContext(MainDrawArea); - if (!GLContext) opengl_good = false; - } - if (opengl_good) - { - uiGLMakeContextCurrent(GLContext); - uiGLSetVSync(Config::ScreenVSync); - if (!GLScreen_Init()) opengl_good = false; - if (opengl_good) - { - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Init(true); - } - uiGLMakeContextCurrent(NULL); - } - - if (opengl && !opengl_good) - { - printf("OpenGL: initialization failed\n"); - RecreateMainWindow(false); - Screen_UseGL = false; - } - - if (!opengl) OSD::Init(false); -} - -void DestroyMainWindow() -{ - uiControlDestroy(uiControl(MainWindow)); - - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenBitmap[0] = NULL; - ScreenBitmap[1] = NULL; -} - -void RecreateMainWindow(bool opengl) -{ - int winX, winY, maxi; - uiWindowPosition(MainWindow, &winX, &winY); - maxi = uiWindowMaximized(MainWindow); - DestroyMainWindow(); - CreateMainWindow(opengl); - uiWindowSetPosition(MainWindow, winX, winY); - uiWindowSetMaximized(MainWindow, maxi); -} - - -int main(int argc, char** argv) -{ - srand(time(NULL)); - - printf("melonDS " MELONDS_VERSION "\n"); - printf(MELONDS_URL "\n"); - -#if defined(__WIN32__) || defined(UNIX_PORTABLE) - if (argc > 0 && strlen(argv[0]) > 0) - { - int len = strlen(argv[0]); - while (len > 0) - { - if (argv[0][len] == '/') break; - if (argv[0][len] == '\\') break; - len--; - } - if (len > 0) - { - EmuDirectory = new char[len+1]; - strncpy(EmuDirectory, argv[0], len); - EmuDirectory[len] = '\0'; - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } -#else - const char* confdir = g_get_user_config_dir(); - const char* confname = "/melonDS"; - EmuDirectory = new char[strlen(confdir) + strlen(confname) + 1]; - strcat(EmuDirectory, confdir); - strcat(EmuDirectory, confname); -#endif - - // http://stackoverflow.com/questions/14543333/joystick-wont-work-using-sdl - SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1"); - - if 
(SDL_Init(SDL_INIT_HAPTIC) < 0) - { - printf("SDL couldn't init rumble\n"); - } - if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_JOYSTICK) < 0) - { - printf("SDL shat itself :(\n"); - return 1; - } - - SDL_JoystickEventState(SDL_ENABLE); - - uiInitOptions ui_opt; - memset(&ui_opt, 0, sizeof(uiInitOptions)); - const char* ui_err = uiInit(&ui_opt); - if (ui_err != NULL) - { - printf("libui shat itself :( %s\n", ui_err); - uiFreeInitError(ui_err); - return 1; - } - - Config::Load(); - - if (Config::AudioVolume < 0) Config::AudioVolume = 0; - else if (Config::AudioVolume > 256) Config::AudioVolume = 256; - - if (!Platform::LocalFileExists("bios7.bin") || - !Platform::LocalFileExists("bios9.bin") || - !Platform::LocalFileExists("firmware.bin")) - { -#if defined(__WIN32__) || defined(UNIX_PORTABLE) - const char* locationName = "the directory you run melonDS from"; -#else - char* locationName = EmuDirectory; -#endif - char msgboxtext[512]; - sprintf(msgboxtext, - "One or more of the following required files don't exist or couldn't be accessed:\n\n" - "bios7.bin -- ARM7 BIOS\n" - "bios9.bin -- ARM9 BIOS\n" - "firmware.bin -- firmware image\n\n" - "Dump the files from your DS and place them in %s.\n" - "Make sure that the files can be accessed.", - locationName - ); - - uiMsgBoxError(NULL, "BIOS/Firmware not found", msgboxtext); - - uiUninit(); - SDL_Quit(); - return 0; - } - if (!Platform::LocalFileExists("firmware.bin.bak")) - { - // verify the firmware - // - // there are dumps of an old hacked firmware floating around on the internet - // and those are problematic - // the hack predates WFC, and, due to this, any game that alters the WFC - // access point data will brick that firmware due to it having critical - // data in the same area. it has the same problem on hardware. - // - // but this should help stop users from reporting that issue over and over - // again, when the issue is not from melonDS but from their firmware dump. - // - // I don't know about all the firmware hacks in existence, but the one I - // looked at has 0x180 bytes from the header repeated at 0x3FC80, but - // bytes 0x0C-0x14 are different. 
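A self-contained sketch of the check just described, assuming the same offsets (compare the 0x180-byte header against the copy 0x380 bytes before EOF, masking the eight bytes from 0x0C that the hack alters); the helper name is hypothetical, and the inline version used by main() follows right after:

    #include <cstdio>
    #include <cstring>

    typedef unsigned char u8;

    // Detect the known hacked firmware dump: its header reappears near the
    // end of the image, differing only in a few masked bytes.
    static bool LooksLikeHackedFirmware(const char* path)
    {
        FILE* f = fopen(path, "rb");
        if (!f) return false;

        u8 chk1[0x180], chk2[0x180];
        bool ok = fread(chk1, 1, 0x180, f) == 0x180;
        ok = ok && fseek(f, -0x380, SEEK_END) == 0;
        ok = ok && fread(chk2, 1, 0x180, f) == 0x180;
        fclose(f);
        if (!ok) return false;

        memset(&chk1[0x0C], 0, 8);
        memset(&chk2[0x0C], 0, 8);
        return memcmp(chk1, chk2, 0x180) == 0;
    }
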
- - FILE* f = Platform::OpenLocalFile("firmware.bin", "rb"); - u8 chk1[0x180], chk2[0x180]; - - fseek(f, 0, SEEK_SET); - fread(chk1, 1, 0x180, f); - fseek(f, -0x380, SEEK_END); - fread(chk2, 1, 0x180, f); - - memset(&chk1[0x0C], 0, 8); - memset(&chk2[0x0C], 0, 8); - - fclose(f); - - if (!memcmp(chk1, chk2, 0x180)) - { - uiMsgBoxError(NULL, - "Problematic firmware dump", - "You are using an old hacked firmware dump.\n" - "Firmware boot will stop working if you run any game that alters WFC settings.\n\n" - "Note that the issue is not from melonDS, it would also happen on an actual DS."); - } - } - { - const char* romlist_missing = "Save memory type detection will not work correctly.\n\n" - "You should use the latest version of romlist.bin (provided in melonDS release packages)."; -#if !defined(UNIX_PORTABLE) && !defined(__WIN32__) - std::string missingstr = std::string(romlist_missing) + - "\n\nThe ROM list should be placed in " + g_get_user_data_dir() + "/melonds/, otherwise " - "melonDS will search for it in the current working directory."; - const char* romlist_missing_text = missingstr.c_str(); -#else - const char* romlist_missing_text = romlist_missing; -#endif - - FILE* f = Platform::OpenDataFile("romlist.bin"); - if (f) - { - u32 data; - fread(&data, 4, 1, f); - fclose(f); - - if ((data >> 24) == 0) // old CRC-based list - { - uiMsgBoxError(NULL, "Your version of romlist.bin is outdated.", romlist_missing_text); - } - } - else - { - uiMsgBoxError(NULL, "romlist.bin not found.", romlist_missing_text); - } - } - - CreateMainWindowMenu(); - - MainDrawAreaHandler.Draw = OnAreaDraw; - MainDrawAreaHandler.MouseEvent = OnAreaMouseEvent; - MainDrawAreaHandler.MouseCrossed = OnAreaMouseCrossed; - MainDrawAreaHandler.DragBroken = OnAreaDragBroken; - MainDrawAreaHandler.KeyEvent = OnAreaKeyEvent; - MainDrawAreaHandler.Resize = OnAreaResize; - - WindowWidth = Config::WindowWidth; - WindowHeight = Config::WindowHeight; - - Screen_UseGL = Config::ScreenUseGL || (Config::_3DRenderer != 0); - - GL_3DScale = Config::GL_ScaleFactor; - if (GL_3DScale < 1) GL_3DScale = 1; - else if (GL_3DScale > 8) GL_3DScale = 8; - - CreateMainWindow(Screen_UseGL); - - ScreenRotation = Config::ScreenRotation; - ScreenGap = Config::ScreenGap; - ScreenLayout = Config::ScreenLayout; - ScreenSizing = Config::ScreenSizing; - -#define SANITIZE(var, min, max) if ((var < min) || (var > max)) var = 0; - SANITIZE(ScreenRotation, 0, 3); - SANITIZE(ScreenLayout, 0, 2); - SANITIZE(ScreenSizing, 0, 3); -#undef SANITIZE - - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]); - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - uiMenuItemDisable(MenuItem_UndoStateLoad); - - uiMenuItemDisable(MenuItem_Pause); - uiMenuItemDisable(MenuItem_Reset); - uiMenuItemDisable(MenuItem_Stop); - - uiMenuItemSetChecked(MenuItem_SavestateSRAMReloc, Config::SavestateRelocSRAM?1:0); - - uiMenuItemSetChecked(MenuItem_ScreenRot[ScreenRotation], 1); - uiMenuItemSetChecked(MenuItem_ScreenLayout[ScreenLayout], 1); - uiMenuItemSetChecked(MenuItem_ScreenSizing[ScreenSizing], 1); - - for (int i = 0; i < 6; i++) - { - if (ScreenGap == kScreenGap[i]) - uiMenuItemSetChecked(MenuItem_ScreenGap[i], 1); - } - - OnSetScreenRotation(MenuItem_ScreenRot[ScreenRotation], MainWindow, (void*)&kScreenRot[ScreenRotation]); - - uiMenuItemSetChecked(MenuItem_ScreenFilter, Config::ScreenFilter==1); - uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1); - uiMenuItemSetChecked(MenuItem_AudioSync, Config::AudioSync==1); - 
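The romlist.bin probe earlier in main() treats a zero high byte in the first 32-bit word as the outdated CRC-based format. A minimal sketch of that version check (hypothetical helper name; assumes little-endian reads, as the code above does):

    #include <cstdio>

    typedef unsigned int u32;

    // Probe romlist.bin's format from its first 32-bit word; a zero top
    // byte marks the old CRC-based list.
    static bool RomListOutdated(const char* path)
    {
        FILE* f = fopen(path, "rb");
        if (!f) return true; // a missing file is reported separately above

        u32 data = 0;
        size_t n = fread(&data, 4, 1, f);
        fclose(f);

        return n != 1 || (data >> 24) == 0;
    }
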
uiMenuItemSetChecked(MenuItem_ShowOSD, Config::ShowOSD==1); - -#ifdef MELONCAP - MelonCap::Init(); -#endif // MELONCAP - - AudioSync = SDL_CreateCond(); - AudioSyncLock = SDL_CreateMutex(); - - AudioFreq = 48000; // TODO: make configurable? - SDL_AudioSpec whatIwant, whatIget; - memset(&whatIwant, 0, sizeof(SDL_AudioSpec)); - whatIwant.freq = AudioFreq; - whatIwant.format = AUDIO_S16LSB; - whatIwant.channels = 2; - whatIwant.samples = 1024; - whatIwant.callback = AudioCallback; - AudioDevice = SDL_OpenAudioDevice(NULL, 0, &whatIwant, &whatIget, SDL_AUDIO_ALLOW_FREQUENCY_CHANGE); - if (!AudioDevice) - { - printf("Audio init failed: %s\n", SDL_GetError()); - } - else - { - AudioFreq = whatIget.freq; - printf("Audio output frequency: %d Hz\n", AudioFreq); - SDL_PauseAudioDevice(AudioDevice, 1); - } - - memset(&whatIwant, 0, sizeof(SDL_AudioSpec)); - whatIwant.freq = 44100; - whatIwant.format = AUDIO_S16LSB; - whatIwant.channels = 1; - whatIwant.samples = 1024; - whatIwant.callback = MicCallback; - MicDevice = SDL_OpenAudioDevice(NULL, 1, &whatIwant, &whatIget, 0); - if (!MicDevice) - { - printf("Mic init failed: %s\n", SDL_GetError()); - MicBufferLength = 0; - } - else - { - SDL_PauseAudioDevice(MicDevice, 1); - } - - memset(MicBuffer, 0, sizeof(MicBuffer)); - MicBufferReadPos = 0; - MicBufferWritePos = 0; - - MicWavBuffer = NULL; - if (Config::MicInputType == 3) MicLoadWav(Config::MicWavPath); - - JoystickID = Config::JoystickID; - Joystick = NULL; - OpenJoystick(); - - EmuRunning = 2; - RunningSomething = false; - EmuThread = SDL_CreateThread(EmuThreadFunc, "melonDS magic", NULL); - - if (argc > 1) - { - char* file = argv[1]; - char* ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl")) - { - strncpy(ROMPath[0], file, 1023); - ROMPath[0][1023] = '\0'; - - SetupSRAMPath(0); - - if (NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot)) - Run(); - } - - if (argc > 2) - { - file = argv[2]; - ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "gba")) - { - strncpy(ROMPath[1], file, 1023); - ROMPath[1][1023] = '\0'; - - SetupSRAMPath(1); - - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - } - } - - uiMain(); - - if (Joystick) SDL_JoystickClose(Joystick); - if (AudioDevice) SDL_CloseAudioDevice(AudioDevice); - if (MicDevice) SDL_CloseAudioDevice(MicDevice); - - SDL_DestroyCond(AudioSync); - SDL_DestroyMutex(AudioSyncLock); - - if (MicWavBuffer) delete[] MicWavBuffer; - -#ifdef MELONCAP - MelonCap::DeInit(); -#endif // MELONCAP - - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - Config::ScreenRotation = ScreenRotation; - Config::ScreenGap = ScreenGap; - Config::ScreenLayout = ScreenLayout; - Config::ScreenSizing = ScreenSizing; - - Config::Save(); - - uiUninit(); - SDL_Quit(); - delete[] EmuDirectory; - return 0; -} - -#ifdef __WIN32__ - -#include <windows.h> - -int CALLBACK WinMain(HINSTANCE hinst, HINSTANCE hprev, LPSTR cmdline, int cmdshow) -{ - int argc = 0; - wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc); - char* nullarg = ""; - - char** argv = new char*[argc]; - for (int i = 0; i < argc; i++) - { - int len = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, NULL, 0, NULL, NULL); - if (len < 1) return NULL; - argv[i] = new char[len]; - int res = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, argv[i], len, NULL, NULL); - if (res != len) { delete[] argv[i]; argv[i] = nullarg; } - } - - if (AttachConsole(ATTACH_PARENT_PROCESS)) - { - freopen("CONOUT$", "w", stdout); -
freopen("CONOUT$", "w", stderr); - printf("\n"); - } - - int ret = main(argc, argv); - - printf("\n\n>"); - - for (int i = 0; i < argc; i++) if (argv[i] != nullarg) delete[] argv[i]; - delete[] argv; - - return ret; -} - -#endif -- cgit v1.2.3