field | value | date |
---|---|---|
author | Arisotura <thetotalworm@gmail.com> | 2020-07-01 00:01:11 +0200 |
committer | GitHub <noreply@github.com> | 2020-07-01 00:01:11 +0200 |
commit | 62c6e2f703d88660e0ca9bda78032c5bd6b63a78 (patch) | |
tree | 1dbf9eb1bbe418d14f07dc3a0e30821fb5deb258 | |
parent | d97ce22b010e868437c649911bce89d679a4deaa (diff) | |
parent | c5381d2911d47fb1fcbd6ec27a83f5da3606c4bd (diff) | |
Merge pull request #667 from Arisotura/generic_jit
merge jit
66 files changed, 27783 insertions, 460 deletions
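The bulk of the diff below wires the new recompiler into the CPU cores: each core gains an `ExecuteJIT()` loop that localises the current PC, looks up a compiled block in a per-region fast lookup table (`FastBlockLookup*`), dispatches it via `ARM_Dispatch()`, and falls back to `ARMJIT::CompileBlock()` when no block exists. Alongside this, the cycle bookkeeping in `ARM.cpp`/`ARM.h` is inverted (`Cycles += …` becomes `Cycles -= …`): `Cycles` now counts down from the remaining budget so generated code only has to test the sign of a single counter. The sketch below is a condensed, self-contained model of that loop under those assumptions, not the real implementation; `MiniCpu`, `compile()` and the block map are invented stand-ins, while the countdown arithmetic mirrors `ARMv5::ExecuteJIT()` in the diff.

```cpp
// Condensed model of the ExecuteJIT() dispatch loop added below. MiniCpu and
// compile() are illustrative stand-ins; in melonDS the lookup goes through
// FastBlockLookup / ARMJIT::LookUpBlock, dispatch through ARM_Dispatch(), and
// ARMJIT::CompileBlock() emits real native code.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

using JitBlockEntry = std::function<void(int64_t&)>;

struct MiniCpu
{
    uint32_t pc = 0;
    int64_t timestamp = 0;                                   // NDS::ARM9Timestamp
    std::unordered_map<uint32_t, JitBlockEntry> blocks;      // fast block lookup stand-in

    JitBlockEntry compile(uint32_t addr)
    {
        // Pretend every block is two instructions / four cycles long.
        JitBlockEntry entry = [](int64_t& cyclesLeft) { cyclesLeft -= 4; };
        blocks[addr] = entry;
        return entry;
    }

    void runUntil(int64_t target)                            // NDS::ARM9Target
    {
        while (timestamp < target)
        {
            // Cycles counts *down*: seed it with the remaining budget minus one
            // ("hack so Cycles <= 0 becomes Cycles < 0" in the diff), so emitted
            // code only needs to test the counter's sign to know when to bail out.
            int64_t cycles = target - timestamp - 1;

            auto it = blocks.find(pc);
            JitBlockEntry entry = (it != blocks.end()) ? it->second : compile(pc);
            entry(cycles);                                    // ARM_Dispatch(this, block)

            timestamp = target - cycles - 1;                  // fold the countdown back in
            pc += 8;                                          // advance past the fake block
        }
    }
};

int main()
{
    MiniCpu cpu;
    cpu.runUntil(64);
    std::printf("timestamp=%lld pc=%08X\n", (long long)cpu.timestamp, (unsigned)cpu.pc);
}
```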
@@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/CMakeLists.txt b/CMakeLists.txt index 885f0dd..6729e73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,42 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64 OR ARCHITECTURE STREQUAL ARM64) + option(ENABLE_JIT "Enable x64 JIT recompiler" ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() + +if (CMAKE_BUILD_TYPE STREQUAL Release) + option(ENABLE_LTO "Enable link-time optimization" ON) +else() + option(ENABLE_LTO "Enable link-time optimization" OFF) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Og) endif() diff --git a/src/ARM.cpp b/src/ARM.cpp index 68cac59..8530795 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,8 +21,15 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "Config.h" #include "AREngine.h" +#include "ARMJIT.h" +#include "Config.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif // instruction timing notes // @@ -72,7 +79,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -80,6 +89,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -96,6 +112,12 @@ void ARM::Reset() CodeMem.Mem = NULL; +#ifdef JIT_ENABLED + FastBlockLookup = NULL; + FastBlockLookupStart = 0; + FastBlockLookupSize = 0; +#endif + // zorp JumpTo(ExceptionBase); } @@ -123,7 +145,6 @@ void ARMv5::Reset() GetMemRegion = NDS::ARM9GetMemRegion; } - CP15Reset(); ARM::Reset(); } @@ -158,7 +179,11 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + + // hack to make save states compatible + u32 halted = Halted; + file->Var32(&halted); + Halted = halted; file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -168,6 +193,15 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); +#ifdef JIT_ENABLED + if (!file->Saving && Config::JIT_Enable) + { + // hack, the JIT doesn't really pipeline + // but we still want JIT save states to be + // loaded while running the interpreter + FillPipeline(); + } +#endif file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); @@ -240,15 +274,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= 
CodeCycles; } CPSR |= 0x20; @@ -261,9 +295,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -303,7 +337,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -316,7 +350,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -558,7 +592,7 @@ void ARMv5::Execute() else AddCycles_C(); } - + // TODO optimize this shit!!! if (Halted) { @@ -575,7 +609,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -583,6 +617,75 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); + if (block) + ARM_Dispatch(this, block); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; + + if (StopExecution) + { + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) + { + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + } + } + + if (Halted == 2) + Halted = 0; +} +#endif + void ARMv4::Execute() { if (Halted) @@ -652,10 +755,131 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } +} + +#ifdef JIT_ENABLED +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, 
FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); + if (block) + ARM_Dispatch(this, block); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; + + // TODO optimize this shit!!! + if (StopExecution) + { + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) + { + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + } + } + + if (Halted == 2) + Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } +} +#endif + +void ARMv5::FillPipeline() +{ + SetupCodeMem(R[15]); + + if (CPSR & 0x20) + { + if ((R[15] - 2) & 0x2) + { + NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; + NextInstr[1] = CodeRead32(R[15], false); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[1] = NextInstr[0] >> 16; + } + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4, false); + NextInstr[1] = CodeRead32(R[15], false); + } } + +void ARMv4::FillPipeline() +{ + SetupCodeMem(R[15]); + + if (CPSR & 0x20) + { + NextInstr[0] = CodeRead16(R[15] - 2); + NextInstr[1] = CodeRead16(R[15]); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + } +}
\ No newline at end of file @@ -32,16 +32,21 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); virtual void DoSavestate(Savestate* file); + virtual void FillPipeline() = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; void RestoreCPSR(); @@ -52,6 +57,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT + virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -107,9 +115,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; @@ -131,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart, FastBlockLookupSize; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -146,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -153,12 +174,17 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void PrefetchAbort(); void DataAbort(); void Execute(); +#ifdef JIT_ENABLED + void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -176,14 +202,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -194,9 +220,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -206,9 +232,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -237,10 +263,14 @@ public: u32 DTCMSetting, ITCMSetting; - u8 ITCM[0x8000]; + // for aarch64 JIT they need to go up here + // to be addressable by a 12-bit immediate u32 ITCMSize; - u8 DTCM[0x4000]; u32 DTCMBase, DTCMSize; + s32 RegionCodeCycles; + + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; @@ -265,7 +295,6 @@ public: // code/16N/32N/32S u8 MemTimings[0x100000][4]; - s32 RegionCodeCycles; u8* CurICacheLine; bool (*GetMemRegion)(u32 addr, bool write, NDS::MemRegion* region); @@ -278,9 +307,14 @@ public: void Reset(); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED + void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { @@ -295,8 +329,8 @@ public: void DataRead8(u32 addr, u32* val) { *val = BusRead8(addr); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -304,8 +338,8 @@ public: addr &= ~1; *val = BusRead16(addr); - DataRegion = addr >> 24; - DataCycles = 
NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -313,8 +347,8 @@ public: addr &= ~3; *val = BusRead32(addr); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -322,14 +356,14 @@ public: addr &= ~3; *val = BusRead32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -337,8 +371,8 @@ public: addr &= ~1; BusWrite16(addr, val); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -346,8 +380,8 @@ public: addr &= ~3; BusWrite32(addr, val); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -355,20 +389,20 @@ public: addr &= ~3; BusWrite32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; } void AddCycles_CDI() @@ -377,24 +411,24 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else { numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } } else if (CodeRegion == 0x02) { numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD + 1; + Cycles -= numC + numD + 1; } } @@ -404,20 +438,20 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else if (CodeRegion == 0x02) { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD; + Cycles -= numC + numD; } } }; @@ -430,4 +464,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void 
(*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp new file mode 100644 index 0000000..2a61c38 --- /dev/null +++ b/src/ARMJIT.cpp @@ -0,0 +1,1204 @@ +#include "ARMJIT.h" + +#include <string.h> +#include <assert.h> +#include <unordered_map> + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + +#include "Config.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" + +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "DSi.h" +#include "GPU.h" +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" +#include "NDSCart.h" + +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + +namespace ARMJIT +{ + +#define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) + +Compiler* JITCompiler; + +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMMaxSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; +AddressRange CodeIndexBIOS9DSi[0x10000 / 512]; +AddressRange CodeIndexBIOS7DSi[0x10000 / 512]; +AddressRange CodeIndexNWRAM_A[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_B[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_C[DSi::NWRAMSize / 512]; + +std::unordered_map<u32, JitBlock*> JitBlocks9; +std::unordered_map<u32, JitBlock*> JitBlocks7; + +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMMaxSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; +u64 FastBlockLookupBIOS9DSi[0x10000 / 2]; +u64 FastBlockLookupBIOS7DSi[0x10000 / 2]; +u64 FastBlockLookupNWRAM_A[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_B[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_C[DSi::NWRAMSize / 2]; + +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = +{ + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMMaxSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, + 0x10000, + 0x10000, + sizeof(DSi::NWRAM_A), + sizeof(DSi::NWRAM_B), + sizeof(DSi::NWRAM_C), +}; + +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = +{ + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, + CodeIndexBIOS9DSi, + CodeIndexBIOS7DSi, + CodeIndexNWRAM_A, + CodeIndexNWRAM_B, + CodeIndexNWRAM_C +}; + +u64* const 
FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = +{ + NULL, + FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM, + FastBlockLookupBIOS9DSi, + FastBlockLookupBIOS7DSi, + FastBlockLookupNWRAM_A, + FastBlockLookupNWRAM_B, + FastBlockLookupNWRAM_C +}; + +u32 LocaliseCodeAddress(u32 num, u32 addr) +{ + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + if (CodeMemRegions[region]) + return ARMJIT_Memory::LocaliseAddress(region, num, addr); + return 0; +} + +TinyVector<u32> InvalidLiterals; + +template <typename T, int ConsoleType> +T SlowRead9(u32 addr, ARMv5* cpu) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same<T, u32>::value) + val = (ConsoleType == 0 ? NDS::ARM9Read32 : DSi::ARM9Read32)(addr); + else if (std::is_same<T, u16>::value) + val = (ConsoleType == 0 ? NDS::ARM9Read16 : DSi::ARM9Read16)(addr); + else + val = (ConsoleType == 0 ? NDS::ARM9Read8 : DSi::ARM9Read8)(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T, int ConsoleType> +void SlowWrite9(u32 addr, ARMv5* cpu, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same<T, u32>::value) + { + (ConsoleType == 0 ? NDS::ARM9Write32 : DSi::ARM9Write32)(addr, val); + } + else if (std::is_same<T, u16>::value) + { + (ConsoleType == 0 ? NDS::ARM9Write16 : DSi::ARM9Write16)(addr, val); + } + else + { + (ConsoleType == 0 ? NDS::ARM9Write8 : DSi::ARM9Write8)(addr, val); + } +} + +template <typename T, int ConsoleType> +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same<T, u32>::value) + val = (ConsoleType == 0 ? NDS::ARM7Read32 : DSi::ARM7Read32)(addr); + else if (std::is_same<T, u16>::value) + val = (ConsoleType == 0 ? NDS::ARM7Read16 : DSi::ARM7Read16)(addr); + else + val = (ConsoleType == 0 ? NDS::ARM7Read8 : DSi::ARM7Read8)(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T, int ConsoleType> +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same<T, u32>::value) + (ConsoleType == 0 ? NDS::ARM7Write32 : DSi::ARM7Write32)(addr, val); + else if (std::is_same<T, u16>::value) + (ConsoleType == 0 ? NDS::ARM7Write16 : DSi::ARM7Write16)(addr, val); + else + (ConsoleType == 0 ? 
NDS::ARM7Write8 : DSi::ARM7Write8)(addr, val); +} + +template <bool Write, int ConsoleType> +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + if (Write) + SlowWrite9<u32, ConsoleType>(addr, cpu, data[i]); + else + data[i] = SlowRead9<u32, ConsoleType>(addr, cpu); + addr += 4; + } +} + +template <bool Write, int ConsoleType> +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + if (Write) + SlowWrite7<u32, ConsoleType>(addr, data[i]); + else + data[i] = SlowRead7<u32, ConsoleType>(addr); + addr += 4; + } +} + +#define INSTANTIATE_SLOWMEM(consoleType) \ + template void SlowWrite9<u32, consoleType>(u32, ARMv5*, u32); \ + template void SlowWrite9<u16, consoleType>(u32, ARMv5*, u16); \ + template void SlowWrite9<u8, consoleType>(u32, ARMv5*, u8); \ + \ + template u32 SlowRead9<u32, consoleType>(u32, ARMv5*); \ + template u16 SlowRead9<u16, consoleType>(u32, ARMv5*); \ + template u8 SlowRead9<u8, consoleType>(u32, ARMv5*); \ + \ + template void SlowWrite7<u32, consoleType>(u32, u32); \ + template void SlowWrite7<u16, consoleType>(u32, u16); \ + template void SlowWrite7<u8, consoleType>(u32, u8); \ + \ + template u32 SlowRead7<u32, consoleType>(u32); \ + template u16 SlowRead7<u16, consoleType>(u32); \ + template u8 SlowRead7<u8, consoleType>(u32); \ + \ + template void SlowBlockTransfer9<false, consoleType>(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer9<true, consoleType>(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer7<false, consoleType>(u32 addr, u64* data, u32 num); \ + template void SlowBlockTransfer7<true, consoleType>(u32 addr, u64* data, u32 num); \ + +INSTANTIATE_SLOWMEM(0) +INSTANTIATE_SLOWMEM(1) + +template <typename K, typename V, int Size, V InvalidValue> +struct UnreliableHashTable +{ + struct Bucket + { + K KeyA, KeyB; + V ValA, ValB; + }; + + Bucket Table[Size]; + + void Reset() + { + for (int i = 0; i < Size; i++) + { + Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable<u32, JitBlock*, 0x800, nullptr> RestoreCandidates; + 
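A note on the `RestoreCandidates` table just declared: `UnreliableHashTable` is a fixed-size cache with two entries per bucket, keyed here by a hash of a block's instruction words; inserting into a full bucket simply evicts an existing candidate, which is why it is "unreliable". It lets `CompileBlock()` further down in this file recover the generated code of a block whose memory was remapped or invalidated without its instructions actually changing, instead of recompiling it. The snippet below is only an illustrative sketch of that contract, assuming the template and `u32` typedef available above; `JitBlockStub` and both helper functions are made up for the example and are not part of the diff.

```cpp
// Illustrative use of the UnreliableHashTable contract declared above. The real
// logic lives in CompileBlock() and InvalidateByAddr() later in ARMJIT.cpp.
struct JitBlockStub { u32 StartAddr; u32 InstrHash; };
using CandidateTable = UnreliableHashTable<u32, JitBlockStub*, 0x800, nullptr>;

// Invalidation path: park the block keyed by its instruction hash instead of
// deleting it. Insert() may evict an existing candidate from a full bucket and
// returns it, so ownership of the evicted block comes back to the caller.
void ParkBlock(CandidateTable& candidates, JitBlockStub* block)
{
    JitBlockStub* evicted = candidates.Insert(block->InstrHash, block);
    if (evicted)
        delete evicted;
}

// Compile path: if a block with an identical instruction stream was parked for
// this address, reuse it; otherwise the caller compiles from scratch.
JitBlockStub* TryRestore(CandidateTable& candidates, u32 blockAddr, u32 instrHash)
{
    JitBlockStub* prev = candidates.LookUp(instrHash);
    if (!prev)
        return nullptr;

    candidates.Remove(instrHash);
    if (prev->StartAddr != blockAddr)
    {
        delete prev;    // hash matched but the block belongs elsewhere; discard it
        return nullptr;
    }
    return prev;
}
```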
+void Init() +{ + JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); +} + +void DeInit() +{ + ARMJIT_Memory::DeInit(); + + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + ARMJIT_Memory::Reset(); +} + +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +{ + for (int j = start; j >= 0; j--) + { + u8 match = instrs[j].Info.WriteFlags & flags; + u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags; + if (matchMaybe) // writes flags maybe + instrs[j].SetFlags |= matchMaybe; + if (match) + { + instrs[j].SetFlags |= match; + flags &= ~match; + if (!flags) + return; + } + } +} + +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) +{ + if (!thumb) + { + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); + return true; + } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; +} + +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } + } + else + { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + JIT_DEBUGPRINT("checking potential idle loop\n"); + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == 
ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +void NOP(ARM* cpu) {} + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC), + NOP +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +void T_BL_LONG(ARM* cpu) +{ + ARMInterpreter::T_BL_LONG_1(cpu); + cpu->R[15] += 2; + ARMInterpreter::T_BL_LONG_2(cpu); +} + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + T_BL_LONG // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) +{ + bool thumb = cpu->CPSR & 0x20; + + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + u32 blockAddr = cpu->R[15] - (thumb ? 
2 : 4); + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 otherLocalAddr = existingBlockIt->second->StartAddrLocal; + + if (localAddr == otherLocalAddr) + { + JIT_DEBUGPRINT("switching out block %x %x %x\n", localAddr, blockAddr, existingBlockIt->second->StartAddr); + + u64* entry = &FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + // some memory has been remapped + JitBlock* prevBlock = RestoreCandidates.Insert(existingBlockIt->second->InstrHash, existingBlockIt->second); + if (prevBlock) + delete prevBlock; + + map.erase(existingBlockIt); + } + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; + int i = 0; + u32 r15 = cpu->R[15]; + + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; + u32 numAddressRanges = 0; + + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + + cpu->FillPipeline(); + u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); + + u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; + + do + { + r15 += thumb ? 2 : 4; + + instrs[i].BranchFlags = 0; + instrs[i].SetFlags = 0; + instrs[i].Instr = nextInstr[0]; + nextInstr[0] = nextInstr[1]; + + instrs[i].Addr = nextInstrAddr[0]; + nextInstrAddr[0] = nextInstrAddr[1]; + nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 
0xFFFF : ~0), instrs[i].Addr); + + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr >> 27); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addressRanges[j] == translatedAddrRounded) + { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); + returning = true; + break; + } + } + if (!returning) + addressRanges[numAddressRanges++] = translatedAddrRounded; + } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); + + if (cpu->Num == 0) + { + ARMv5* cpuv5 = (ARMv5*)cpu; + if (thumb && r15 & 0x2) + { + nextInstr[1] >>= 16; + instrs[i].CodeCycles = 0; + } + else + { + nextInstr[1] = cpuv5->CodeRead32(r15, false); + instrs[i].CodeCycles = cpu->CodeCycles; + } + } + else + { + ARMv4* cpuv4 = (ARMv4*)cpu; + if (thumb) + nextInstr[1] = cpuv4->CodeRead16(r15); + else + nextInstr[1] = cpuv4->CodeRead32(r15); + instrs[i].CodeCycles = cpu->CodeCycles; + } + instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (instrs[i].Info.DstRegs & (1 << 14) + || (!thumb + && (instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_IMM || instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_REG) + && instrs[i].Instr & (1 << 16))) + hasLink = false; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] + || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) + { + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; + } + + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 + && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) + { + instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG; + instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16); + instrs[i - 1].Info.DstRegs = 0xC000; + instrs[i - 
1].Info.SrcRegs = 0; + instrs[i - 1].Info.EndBlock = true; + i--; + } + + if (instrs[i].Info.Branches() && Config::JIT_BranchOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + instrs[i].BranchFlags |= branch_StaticTarget; + + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + if (link) + { + lr = linkAddr; + hasLink = true; + } + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; + + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + + JitBlock* prevBlock = RestoreCandidates.LookUp(instrHash); + bool mayRestore = true; + if (prevBlock) + { + RestoreCandidates.Remove(instrHash); + + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } + + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; + for (int j = 0; j < numAddressRanges; j++) + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; + + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + + JIT_DEBUGPRINT("block start %p\n", block->EntryPoint); + } + else + { + JIT_DEBUGPRINT("restored! %p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + + AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); + } + + if (cpu->Num == 0) + JitBlocks9[blockAddr] = block; + else + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 27)][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); +} + +void InvalidateByAddr(u32 localAddr) +{ + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); + + AddressRange* region = CodeMemRegions[localAddr >> 27]; + AddressRange* range = ®ion[(localAddr & 0x7FFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) + { + JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); + } + + bool literalInvalidation = 
false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == localAddr) + { + if (InvalidLiterals.Find(localAddr) != -1) + { + InvalidLiterals.Add(localAddr); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 512) != (localAddr / 512)) + { + AddressRange* otherRegion = CodeMemRegions[addr >> 27]; + AddressRange* otherRange = &otherRegion[(addr & 0x7FFFFFF) / 512]; + assert(otherRange != range); + + bool removed = otherRange->Blocks.RemoveByValue(block); + assert(removed); + + if (otherRange->Blocks.Length == 0) + { + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); + + otherRange->Code = 0; + } + } + } + + FastBlockLookupRegions[block->StartAddrLocal >> 27][(block->StartAddrLocal & 0x7FFFFFF) / 2] = (u64)UINT32_MAX << 32; + if (block->Num == 0) + JitBlocks9.erase(block->StartAddr); + else + JitBlocks7.erase(block->StartAddr); + + if (!literalInvalidation) + { + JitBlock* prevBlock = RestoreCandidates.Insert(block->InstrHash, block); + if (prevBlock) + delete prevBlock; + } + else + { + delete block; + } + } +} + +void CheckAndInvalidateITCM() +{ + for (u32 i = 0; i < ITCMPhysicalSize; i+=16) + { + if (CodeIndexITCM[i / 512].Code & (1 << ((i & 0x1FF) / 16))) + { + InvalidateByAddr(i | (ARMJIT_Memory::memregion_ITCM << 27)); + } + } +} + +template <u32 num, int region> +void CheckAndInvalidate(u32 addr) +{ + u32 localAddr = ARMJIT_Memory::LocaliseAddress(region, num, addr); + if (CodeMemRegions[region][(localAddr & 0x7FFFFFF) / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr); +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; +} + +void blockSanityCheck(u32 num, u32 blockAddr, JitBlockEntry entry) +{ + u32 localAddr = LocaliseCodeAddress(num, blockAddr); + assert(JITCompiler->AddEntryOffset((u32)FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]) == entry); +} + +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) +{ + // amazingly ignoring the DTCM is the proper behaviour for code fetches + int region = num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 memoryOffset; + if (FastBlockLookupRegions[region] + && ARMJIT_Memory::GetMirrorLocation(region, num, blockAddr, memoryOffset, start, size)) + { + //printf("setup exec region %d %d %08x %08x %x %x\n", num, region, blockAddr, start, size, memoryOffset); + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + return true; + } + return false; +} + +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + InvalidLiterals.Clear(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); + RestoreCandidates.Reset(); + for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) + { + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValA; + RestoreCandidates.Table[i].ValA = NULL; + } + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValB; + RestoreCandidates.Table[i].ValB = NULL; + } + } + for (auto it : JitBlocks9) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; + } + delete block; + } + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); + + JITCompiler->Reset(); +} + +} diff --git a/src/ARMJIT.h b/src/ARMJIT.h new file mode 100644 index 0000000..04add59 --- /dev/null +++ b/src/ARMJIT.h @@ -0,0 +1,37 @@ +#ifndef ARMJIT_H +#define ARMJIT_H + +#include "types.h" + +#include "ARM.h" +#include "ARM_InstrInfo.h" + +namespace ARMJIT +{ + +typedef void (*JitBlockEntry)(); + +void Init(); +void DeInit(); + +void Reset(); + +void CheckAndInvalidateITCM(); + +void InvalidateByAddr(u32 pseudoPhysical); + +template <u32 num, int region> +void CheckAndInvalidate(u32 addr); + +void CompileBlock(ARM* cpu); + +void ResetBlockCache(); + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 
blockAddr, u64*& entry, u32& start, u32& size); + +} + +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + +#endif
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..5f021a0 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -0,0 +1,930 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_RegShiftReg(int op, bool S, Op2& op2, ARM64Reg rs) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + UBFX(W1, rs, 0, 8); + + if (!S) + { + if (op == 3) + RORV(W0, op2.Reg.Rm, W1); + else + { + CMP(W1, 32); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GE); + ASRV(W0, op2.Reg.Rm, W1); + } + else + { + if (op == 0) + LSLV(W0, op2.Reg.Rm, W1); + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W0, WZR, W0, CC_GE); + } + } + } + else + { + MOV(W0, op2.Reg.Rm); + FixupBranch zero = CBZ(W1); + + SUB(W1, W1, 1); + if (op == 3) + { + RORV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + CMP(W1, 31); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GT); + ASRV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + if (op == 0) + { + LSLV(W0, op2.Reg.Rm, W1); + UBFX(W1, W0, 31, 1); + } + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W1, WZR, op ? W0 : W1, CC_GT); + BFI(RCPSR, W1, 29, 1); + CSEL(W0, WZR, W0, CC_GE); + } + } + + MOV(W0, W0, ArithOption(W0, (ShiftType)op, 1)); + SetJumpTarget(zero); + } + op2 = Op2(W0, ST_LSL, 0); +} + +void Compiler::Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, ARM64Reg tmp) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + switch (op) + { + case 0: // LSL + if (S && amount) + { + UBFX(tmp, op2.Reg.Rm, 32 - amount, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_LSL, amount); + return; + case 1: // LSR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + if (amount == 0) + { + op2 = Op2(0); + return; + } + op2 = Op2(op2.Reg.Rm, ST_LSR, amount); + return; + case 2: // ASR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ASR, amount ? 
amount : 31); + return; + case 3: // ROR + if (amount == 0) + { + UBFX(tmp, RCPSR, 29, 1); + LSL(tmp, tmp, 31); + if (S) + BFI(RCPSR, op2.Reg.Rm, 29, 1); + ORR(tmp, tmp, op2.Reg.Rm, ArithOption(tmp, ST_LSR, 1)); + + op2 = Op2(tmp, ST_LSL, 0); + } + else + { + if (S) + { + UBFX(tmp, op2.Reg.Rm, amount - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ROR, amount); + } + return; + } +} + +void Compiler::Comp_RetriveFlags(bool retriveCV) +{ + if (CurInstr.SetFlags) + CPSRDirty = true; + + if (CurInstr.SetFlags & 0x4) + { + CSET(W0, CC_EQ); + BFI(RCPSR, W0, 30, 1); + } + if (CurInstr.SetFlags & 0x8) + { + CSET(W0, CC_MI); + BFI(RCPSR, W0, 31, 1); + } + if (retriveCV) + { + if (CurInstr.SetFlags & 0x2) + { + CSET(W0, CC_CS); + BFI(RCPSR, W0, 29, 1); + } + if (CurInstr.SetFlags & 0x1) + { + CSET(W0, CC_VS); + BFI(RCPSR, W0, 28, 1); + } + } +} + +void Compiler::Comp_Logical(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (S && !CurInstr.SetFlags) + S = false; + + switch (op) + { + case 0x0: // AND + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, op2.Imm, W0); + else + ANDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, op2.Imm, W0); + else + AND(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x1: // EOR + if (op2.IsImm) + EORI2R(rd, rn, op2.Imm, W0); + else + EOR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xC: // ORR + if (op2.IsImm) + ORRI2R(rd, rn, op2.Imm, W0); + else + ORR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xE: // BIC + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, ~op2.Imm, W0); + else + BICS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, ~op2.Imm, W0); + else + BIC(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + if (S && !CurInstr.SetFlags) + S = false; + + bool CVInGPR = false; + switch (op) + { + case 0x2: // SUB + if (S) + { + if (op2.IsImm) + SUBSI2R(rd, rn, op2.Imm, W0); + else + SUBS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + { + MOVI2R(W2, op2.Imm); + SUBI2R(rd, rn, op2.Imm, W0); + } + else + SUB(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x3: // RSB + if (op2.IsZero()) + { + op2 = Op2(WZR); + } + else if (op2.IsImm) + { + MOVI2R(W1, op2.Imm); + op2 = Op2(W1); + } + else if (op2.Reg.ShiftAmount != 0) + { + MOV(W1, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W1); + } + + if (S) + SUBS(rd, op2.Reg.Rm, rn); + else + SUB(rd, op2.Reg.Rm, rn); + break; + case 0x4: // ADD + if (S) + { + if (op2.IsImm) + ADDSI2R(rd, rn, op2.Imm, W0); + else + ADDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ADDI2R(rd, rn, op2.Imm, W0); + else + ADD(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x5: // ADC + UBFX(W2, RCPSR, 29, 1); + if (S) + { + CVInGPR = true; + ADDS(W1, rn, W2); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm, W0); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, rn, W2); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm, W0); + else + ADD(rd, W1, op2.Reg.Rm, 
op2.ToArithOption()); + } + break; + case 0x6: // SBC + UBFX(W2, RCPSR, 29, 1); + // W1 = -op2 - 1 + if (op2.IsImm) + MOVI2R(W1, ~op2.Imm); + else + ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); + if (S) + { + CVInGPR = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + ADDS(rd, rn, W1); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + ADD(rd, rn, W1); + } + break; + case 0x7: // RSC + UBFX(W2, RCPSR, 29, 1); + // W1 = -rn - 1 + MVN(W1, rn); + if (S) + { + CVInGPR = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm); + else + ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + { + if (CVInGPR) + { + BFI(RCPSR, W2, 29, 1); + BFI(RCPSR, W3, 28, 1); + } + Comp_RetriveFlags(!CVInGPR); + } +} + +void Compiler::Comp_Compare(int op, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + switch (op) + { + case 0x8: // TST + if (op2.IsImm) + TSTI2R(rn, op2.Imm, W0); + else + ANDS(WZR, rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0x9: // TEQ + if (op2.IsImm) + EORI2R(W0, rn, op2.Imm, W0); + else + EOR(W0, rn, op2.Reg.Rm, op2.ToArithOption()); + TST(W0, W0); + break; + case 0xA: // CMP + if (op2.IsImm) + CMPI2R(rn, op2.Imm, W0); + else + CMP(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0xB: // CMN + if (op2.IsImm) + ADDSI2R(WZR, rn, op2.Imm, W0); + else + CMN(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + } + + Comp_RetriveFlags(op >= 0xA); +} + +// also counts cycles! 
+void Compiler::A_Comp_GetOp2(bool S, Op2& op2) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + op2 = Op2(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + op2.Reg.Rm = MapReg(CurInstr.A_Reg(0)); + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + if (CurInstr.A_Reg(0) == 15) + { + ADD(W0, op2.Reg.Rm, 4); + op2.Reg.Rm = W0; + } + Comp_RegShiftReg(op, S, op2, rs); + } + else + { + Comp_AddCycles_C(); + + int amount = (CurInstr.Instr >> 7) & 0x1F; + Comp_RegShiftImm(op, amount, S, op2); + } + } +} + +void Compiler::A_Comp_ALUCmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(op <= 0x9, op2); + + Comp_Compare(op, rn, op2); +} + +void Compiler::A_Comp_ALUMovOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + Op2 op2; + A_Comp_GetOp2(S, op2); + + if (op == 0xF) // MVN + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm); + MOVI2R(rd, ~op2.Imm); + } + else + ORN(rd, WZR, op2.Reg.Rm, op2.ToArithOption()); + } + else // MOV + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm); + MOVI2R(rd, op2.Imm); + } + else + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } + } + + if (S) + { + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_ALUTriOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + bool logical = (1 << op) & 0xF303; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(S && logical, op2); + + if (op2.IsImm && op2.Imm == 0) + op2 = Op2(WZR, ST_LSL, 0); + + if (logical) + Comp_Logical(op, S, rd, rn, op2); + else + Comp_Arithmetic(op, S, rd, rn, op2); + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_Clz() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + + CLZ(rd, rm); + + assert(Num == 0); +} + +void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg rs, ARM64Reg rn) +{ + if (Num == 0) + { + Comp_AddCycles_CI(S ? 3 : 1); + } + else + { + CLS(W0, rs); + Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (mla) + MADD(rd, rm, rs, rn); + else + MUL(rd, rm, rs); + + if (S && FlagsNZNeeded()) + { + TST(rd, rd); + Comp_RetriveFlags(false); + } +} + +void Compiler::A_Comp_Mul_Long() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + + if (Num == 0) + { + Comp_AddCycles_CI(S ? 
3 : 1); + } + else + { + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); + Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (add) + { + MOV(W0, rn); + BFI(X0, EncodeRegTo64(rd), 32, 32); + if (sign) + SMADDL(EncodeRegTo64(rn), rm, rs, X0); + else + UMADDL(EncodeRegTo64(rn), rm, rs, X0); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + else + { + if (sign) + SMULL(EncodeRegTo64(rn), rm, rs); + else + UMULL(EncodeRegTo64(rn), rm, rs); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + +void Compiler::A_Comp_Mul() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + + bool S = CurInstr.Instr & (1 << 20); + bool mla = CurInstr.Instr & (1 << 21); + ARM64Reg rn = INVALID_REG; + if (mla) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_Mul_Mla(S, mla, rd, rm, rs, rn); +} + +void Compiler::T_Comp_ShiftImm() +{ + Comp_AddCycles_C(); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + Op2 op2; + op2.Reg.Rm = MapReg(CurInstr.T_Reg(3)); + Comp_RegShiftImm(op, amount, true, op2); + if (op2.IsImm) + MOVI2R(rd, op2.Imm); + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + + Comp_RetriveFlags(false); +} + +void Compiler::T_Comp_AddSub_() +{ + Comp_AddCycles_C(); + + Op2 op2; + if (CurInstr.Instr & (1 << 10)) + op2 = Op2((CurInstr.Instr >> 6) & 0x7); + else + op2 = Op2(MapReg(CurInstr.T_Reg(6))); + + Comp_Arithmetic( + CurInstr.Instr & (1 << 9) ? 
0x2 : 0x4, + true, + MapReg(CurInstr.T_Reg(0)), + MapReg(CurInstr.T_Reg(3)), + op2); +} + +void Compiler::T_Comp_ALUImm8() +{ + Comp_AddCycles_C(); + + u32 imm = CurInstr.Instr & 0xFF; + int op = (CurInstr.Instr >> 11) & 0x3; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + + switch (op) + { + case 0: + MOVI2R(rd, imm); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + case 1: + Comp_Compare(0xA, rd, Op2(imm)); + break; + case 2: + case 3: + Comp_Arithmetic(op == 2 ? 0x4 : 0x2, true, rd, rd, Op2(imm)); + break; + } +} + +void Compiler::T_Comp_ALU() +{ + int op = (CurInstr.Instr >> 6) & 0xF; + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.T_Reg(3)); + + if ((op >= 0x2 && op <= 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + Comp_Logical(0x0, true, rd, rd, Op2(rs)); + break; + case 0x1: + Comp_Logical(0x1, true, rd, rd, Op2(rs)); + break; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + Op2 op2; + op2.Reg.Rm = rd; + Comp_RegShiftReg(op == 0x7 ? 3 : (op - 0x2), true, op2, rs); + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + break; + case 0x5: + Comp_Arithmetic(0x5, true, rd, rd, Op2(rs)); + break; + case 0x6: + Comp_Arithmetic(0x6, true, rd, rd, Op2(rs)); + break; + case 0x8: + Comp_Compare(0x8, rd, Op2(rs)); + break; + case 0x9: + Comp_Arithmetic(0x3, true, rd, rs, Op2(0)); + break; + case 0xA: + Comp_Compare(0xA, rd, Op2(rs)); + break; + case 0xB: + Comp_Compare(0xB, rd, Op2(rs)); + break; + case 0xC: + Comp_Logical(0xC, true, rd, rd, Op2(rs)); + break; + case 0xD: + Comp_Mul_Mla(true, false, rd, rd, rs, INVALID_REG); + break; + case 0xE: + Comp_Logical(0xE, true, rd, rd, Op2(rs)); + break; + case 0xF: + MVN(rd, rs); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0: + Comp_Arithmetic(0x4, false, rdMapped, rdMapped, Op2(rs)); + break; + case 1: + Comp_Compare(0xA, rdMapped, rs); + return; + case 2: + MOV(rdMapped, rs); + break; + } + + if (rd == 15) + { + Comp_JumpTo(rdMapped, false, false); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + ARM64Reg sp = MapReg(13); + u32 offset = (CurInstr.Instr & 0x7F) << 2; + if (CurInstr.Instr & (1 << 7)) + SUB(sp, sp, offset); + else + ADD(sp, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + ARM64Reg sp = MapReg(13); + ADD(rd, sp, offset); + } + else + MOVI2R(rd, (R15 & ~2) + offset); +} + +}
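
The flag handling in this translation unit leans on CSET/BFI pairs to copy each host condition flag into bits 31..28 of the emulated CPSR (see Comp_RetriveFlags above). As a quick reference, here is a minimal standalone sketch of that bit layout; it is not part of the diff, and the function and variable names are illustrative only.

    #include <cstdint>
    #include <cstdio>

    // Pack N/Z/C/V into the top nibble of a CPSR word, matching the
    // BFI(RCPSR, ..., 31/30/29/28, 1) targets used by the compiler above.
    static uint32_t packNZCV(uint32_t cpsr, bool n, bool z, bool c, bool v)
    {
        cpsr &= ~0xF0000000u;          // clear bits 31..28 (N, Z, C, V)
        cpsr |= (uint32_t)n << 31;     // N - negative
        cpsr |= (uint32_t)z << 30;     // Z - zero
        cpsr |= (uint32_t)c << 29;     // C - carry
        cpsr |= (uint32_t)v << 28;     // V - overflow
        return cpsr;
    }

    int main()
    {
        // e.g. a SUBS result that was negative with a borrow: N=1, Z=0, C=0, V=0
        printf("%08X\n", (unsigned)packNZCV(0x000000DF, true, false, false, false)); // 800000DF
        return 0;
    }
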
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..f130938 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -0,0 +1,421 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +// hack +const int kCodeCacheTiming = 3; + +namespace ARMJIT +{ + +template <typename T> +void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) +{ + cpu->JumpTo(addr, changeCPSR); +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + ORRI2R(RCPSR, RCPSR, 0x20); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + ANDI2R(RCPSR, RCPSR, ~0x20); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOVI2R(W0, regionCodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true) >> 16; + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOVI2R(W0, codeRegion); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeRegion)); + MOVI2R(W0, codeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + { + MOVI2R(W0, newPC); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + } + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + + +void* Compiler::Gen_JumpTo9(int kind) +{ + AlignCode16(); + void* res = GetRXPtr(); + + LSR(W1, W0, 12); + ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); + LDRB(W1, RCPU, W1); + + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, 
ITCMSize)); + + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + // ARM + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); + RET(); + } + + if (kind == 0 || kind == 2) + { + // Thumb + if (kind == 0) + { + SetJumpTarget(switchToThumb); + ORRI2R(RCPSR, RCPSR, 0x20); + } + + ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); + RET(); + } + + return res; +} + +void* Compiler::Gen_JumpTo7(int kind) +{ + void* res = GetRXPtr(); + + LSR(W1, W0, 24); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeRegion)); + LSR(W1, W0, 15); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeCycles)); + + MOVP2R(X2, NDS::ARM7MemTimings); + LDR(W3, X2, ArithOption(W1, true)); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + UBFX(W2, W3, 0, 8); + UBFX(W3, W3, 8, 8); + ADD(W2, W3, W2); + SUB(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + UBFX(W2, W3, 16, 8); + UBFX(W3, W3, 24, 8); + ADD(W2, W3, W2); + SUB(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + + return res; +} + +void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR) +{ + IrregularCycles = true; + + if (!restoreCPSR) + { + if (switchThumb) + CPSRDirty = true; + MOV(W0, addr); + BL((Num ? JumpToFuncs7 : JumpToFuncs9)[switchThumb ? 
0 : (Thumb + 1)]); + } + else + { + + bool cpsrDirty = CPSRDirty; + SaveCPSR(); + SaveCycles(); + PushRegs(restoreCPSR); + + if (switchThumb) + MOV(W1, addr); + else + { + if (Thumb) + ORRI2R(W1, addr, 1); + else + ANDI2R(W1, addr, ~1); + } + MOV(X0, RCPU); + MOVI2R(W2, restoreCPSR); + if (Num == 0) + QuickCallFunction(X3, jumpToTrampoline<ARMv5>); + else + QuickCallFunction(X3, jumpToTrampoline<ARMv4>); + + PopRegs(restoreCPSR); + LoadCycles(); + LoadCPSR(); + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; + } +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOVI2R(MapReg(14), R15 - 4); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + ARM64Reg rn = MapReg(CurInstr.A_Reg(0)); + MOV(W0, rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOVI2R(MapReg(14), R15 - 4); + Comp_JumpTo(W0, true); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_BranchSpecialBehaviour(true); + + FixupBranch skipFailed = B(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + + Comp_BranchSpecialBehaviour(false); + + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(W0, MapReg(CurInstr.A_Reg(3))); + MOVI2R(MapReg(14), R15 - 1); + Comp_JumpTo(W0, true); + } + else + { + ARM64Reg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn, true); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOVI2R(MapReg(14), R15 + offset); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + ARM64Reg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + ADD(W0, lr, offset); + MOVI2R(lr, (R15 - 2) | 1); + Comp_JumpTo(W0, Num == 0 && !(CurInstr.Instr & (1 << 12))); +} + +void Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOVI2R(MapReg(14), (R15 - 2) | 1); + + Comp_JumpTo(target); +} + +}
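
The Thumb branch handlers above recover signed byte offsets from the 11-bit immediate with a shift-left/arithmetic-shift-right pair (e.g. `(s32)((Instr & 0x7FF) << 21) >> 20` in T_Comp_B). Below is a small standalone sketch of that trick; the names are illustrative, not the compiler's helpers.

    #include <cstdint>
    #include <cstdio>

    // Sign-extend the 11-bit Thumb branch immediate and convert it to a byte
    // offset: shifting left by 21 puts the sign bit (bit 10) at bit 31, and the
    // arithmetic shift right by 20 brings it back down one bit short, i.e. x2
    // (halfword count -> byte offset).
    static int32_t thumbBranchOffset(uint32_t instr)
    {
        return (int32_t)((instr & 0x7FF) << 21) >> 20;
    }

    int main()
    {
        printf("%d\n", thumbBranchOffset(0xE7FE)); // -4: offset of the common branch-to-self encoding
        printf("%d\n", thumbBranchOffset(0xE001)); // +2: one halfword forward
        return 0;
    }
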
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..42435ed --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -0,0 +1,884 @@ +#ifdef __SWITCH__ +#include "../switch/compat_switch.h" + +extern char __start__; +#else +#include <sys/mman.h> +#include <unistd.h> +#endif + +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + +#include <malloc.h> + +using namespace Arm64Gen; + +extern "C" void ARM_Ret(); + +namespace ARMJIT +{ + +/* + + Recompiling classic ARM to ARMv8 code is at the same time + easier and trickier than compiling to a less related architecture + like x64. At one hand you can translate a lot of instructions directly. + But at the same time, there are a ton of exceptions, like for + example ADD and SUB can't have a RORed second operand on ARMv8. + + While writing a JIT when an instruction is recompiled into multiple ones + not to write back until you've read all the other operands! +*/ + +template <> +const ARM64Reg RegisterCache<Compiler, ARM64Reg>::NativeRegAllocOrder[] = + {W19, W20, W21, W22, W23, W24, W25, W26}; +template <> +const int RegisterCache<Compiler, ARM64Reg>::NativeRegsAvailable = 8; + +const int JitMemSize = 16 * 1024 * 1024; +#ifndef __SWITCH__ +u8 JitMem[JitMemSize]; +#endif + +void Compiler::MovePC() +{ + ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); +} + +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); + + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ 
+ if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + +Compiler::Compiler() +{ +#ifdef __SWITCH__ + JitRWBase = memalign(0x1000, JitMemSize); + + JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000; + JitRWStart = virtmemReserve(JitMemSize); + MemoryInfo info = {0}; + u32 pageInfo = {0}; + int i = 0; + while (JitRXStart != NULL) + { + svcQueryMemory(&info, &pageInfo, (u64)JitRXStart); + if (info.type != MemType_Unmapped) + JitRXStart = (void*)((u8*)info.addr - JitMemSize - 0x1000); + else + break; + if (i++ > 8) + { + printf("couldn't find unmapped place for jit memory\n"); + JitRXStart = NULL; + } + } + + assert(JitRXStart != NULL); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize, Perm_Rx)); + assert(succeded); + succeded = R_SUCCEEDED(svcMapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + + SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); + JitMemMainSize = JitMemSize; +#else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned; + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + + SetCodeBase(pageAligned, pageAligned); + JitMemUseableSize = alignedSize; +#endif + SetCodePtr(0); + + for (int i = 0; i < 3; i++) + { + JumpToFuncs9[i] = Gen_JumpTo9(i); + JumpToFuncs7[i] = Gen_JumpTo7(i); + } + + /* + W5 - mode + W1 - reg num + W3 - in/out value of reg + */ + { + ReadBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W5, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W5, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W5, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W5, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + RET(); + SetJumpTarget(irq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + RET(); + SetJumpTarget(svc); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + RET(); + SetJumpTarget(abt); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + RET(); + SetJumpTarget(und); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + RET(); + } + { + WriteBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W5, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W5, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W5, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W5, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + MOVI2R(W4, 0); + RET(); + + SetJumpTarget(fiq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(irq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(svc); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(abt); + STR(INDEX_UNSIGNED, W3, 
X2, offsetof(ARM, R_ABT)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(und); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + MOVI2R(W4, 1); + RET(); + } + + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 33: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 17: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + case 9: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 33: QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 17: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + case 9: QuickCallFunction(X3, SlowRead7<u8>); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; + + SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); +} + +Compiler::~Compiler() +{ +#ifdef __SWITCH__ + if (JitRWStart != NULL) + { + bool succeded = R_SUCCEEDED(svcUnmapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + virtmemFree(JitRWStart, JitMemSize); + succeded = R_SUCCEEDED(svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + free(JitRWBase); + } +#endif +} + +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::LoadReg(int reg, ARM64Reg nativeReg) +{ + if (reg == 15) + MOVI2R(nativeReg, R15); + else + LDR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::SaveReg(int reg, ARM64Reg nativeReg) +{ + STR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + LDR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); +} + +void Compiler::SaveCPSR(bool markClean) +{ + if (CPSRDirty) + { + STR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); + CPSRDirty = CPSRDirty && !markClean; + } +} + +FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + LSR(W1, RCPSR, 28); + MOVI2R(W2, 1); + LSLV(W2, W2, W1); + ANDI2R(W2, W2, ARM::ConditionTable[cond], W3); + + return CBZ(W2); + } + else + { + u8 bit = (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))); + + if (cond & 1) + return TBNZ(RCPSR, bit); + else + return TBZ(RCPSR, bit); + } +} + +#define F(x) &Compiler::A_Comp_##x +const Compiler::CompileFunc 
A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // EOR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SUB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADD + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SBC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ORR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MOV + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // BIC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MVN + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // TST + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // TEQ + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMP + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMN + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // Mul + F(Mul), F(Mul), F(Mul_Long), 
F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), + // ARMv5 exclusives + F(Clz), NULL, NULL, NULL, NULL, + + // STR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSB + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // Swap + NULL, NULL, + // LDM, STM + F(LDM_STM), F(LDM_STM), + // Branch + F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), + // Special + NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL, + &Compiler::Nop +}; +#undef F +#define F(x) &Compiler::T_Comp_##x +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = +{ + // Shift imm + F(ShiftImm), F(ShiftImm), F(ShiftImm), + // Add/sub tri operand + F(AddSub_), F(AddSub_), F(AddSub_), F(AddSub_), + // 8 bit imm + F(ALUImm8), F(ALUImm8), F(ALUImm8), F(ALUImm8), + // ALU + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + // ALU hi reg + F(ALU_HiReg), F(ALU_HiReg), F(ALU_HiReg), + // PC/SP relative ops + F(RelAddr), F(RelAddr), F(AddSP), + // LDR PC rel + F(LoadPCRel), + // LDR/STR reg offset + F(MemReg), F(MemReg), F(MemReg), F(MemReg), + // LDR/STR sign extended, half + F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), + // LDR/STR imm offset + F(MemImm), F(MemImm), F(MemImm), F(MemImm), + // LDR/STR half imm offset + F(MemImmHalf), F(MemImmHalf), + // LDR/STR sp rel + F(MemSPRel), F(MemSPRel), + // PUSH/POP + F(PUSH_POP), F(PUSH_POP), + // LDMIA, STMIA + F(LDMIA_STMIA), F(LDMIA_STMIA), + // Branch + F(BCOND), F(BranchXchangeReg), F(BranchXchangeReg), F(B), F(BL_LONG_1), F(BL_LONG_2), + // Unk, SVC + NULL, NULL, + F(BL_Merged) +}; + +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? 
T_Comp[kind] : A_Comp[kind]) != NULL; +} + +void Compiler::Comp_BranchSpecialBehaviour(bool taken) +{ + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + { + MOVI2R(W0, 1); + STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); + } + + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) + { + RegCache.PrepareExit(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +{ + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) + { + printf("JIT far memory full, resetting...\n"); + ResetBlockCache(); + } + + JitBlockEntry res = (JitBlockEntry)GetRXPtr(); + + Thumb = thumb; + Num = cpu->Num; + CurCPU = cpu; + ConstantCycles = 0; + RegCache = RegisterCache<Compiler, ARM64Reg>(this, instrs, instrsCount, true); + CPSRDirty = false; + + for (int i = 0; i < instrsCount; i++) + { + CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; + + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + Exit = i == (instrsCount - 1) || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + //printf("%x instr %x regs: r%x w%x n%x flags: %x %x %x\n", R15, CurInstr.Instr, CurInstr.Info.SrcRegs, CurInstr.Info.DstRegs, CurInstr.Info.ReadFlags, CurInstr.Info.NotStrictlyNeeded, CurInstr.Info.WriteFlags, CurInstr.SetFlags); + + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOVI2R(W0, R15); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + if (comp == NULL) + { + MOVI2R(W0, CurInstr.Instr); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CurInstr)); + } + if (Num == 0) + { + MOVI2R(W0, (s32)CurInstr.CodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + } + } + + if (comp == NULL) + { + SaveCycles(); + SaveCPSR(); + RegCache.Flush(); + } + else + RegCache.Prepare(Thumb, i); + + if (Thumb) + { + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(X0, RCPU); + QuickCallFunction(X1, ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + Comp_AddCycles_C(); + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretARM[CurInstr.Info.Kind]); + } + else + { + (this->*comp)(); + } + + Comp_BranchSpecialBehaviour(true); + + if (cond < 0xE) + { + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) + { + FixupBranch skipNop = B(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(); + + Comp_BranchSpecialBehaviour(false); + + SetJumpTarget(skipNop); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + { + LoadCycles(); + LoadCPSR(); + } + } + + RegCache.Flush(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); + + FlushIcache(); + + 
return res; +} + +void Compiler::Reset() +{ + LoadStorePatches.clear(); + + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; + + const u32 brk_0 = 0xD4200000; + + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) + *(((u32*)GetRWPtr()) + i) = brk_0; +} + +void Compiler::Comp_AddCycles_C(bool forceNonConstant) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (forceNonConstant) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 numI) +{ + IrregularCycles = true; + + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; + + if (Thumb || CurInstr.Cond() == 0xE) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) +{ + IrregularCycles = true; + + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + c; + + SUB(RCycles, RCycles, cycles); + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + SUB(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 24) == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) + SUB(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; +} + +}
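
Comp_AddCycles_CDI and Comp_AddCycles_CD above merge the code-fetch and data-access timings with a rule of the form max(numC + numD - k, max(numC, numD)), with k = 3 on the ARM7 paths and k = 6 on the ARM9 path, so that overlapping bus activity is only partially charged. Here is a standalone sketch of that formula with illustrative names; it is a reading aid, not code from the diff.

    #include <algorithm>
    #include <cstdio>

    // Merge code (numC) and data (numD) cycle counts, crediting at most
    // `overlap` cycles of parallelism between the two accesses.
    static int mergedCycles(int numC, int numD, int overlap)
    {
        return std::max(numC + numD - overlap, std::max(numC, numD));
    }

    int main()
    {
        printf("%d\n", mergedCycles(4, 2, 3)); // 4: the data access hides behind the code fetch
        printf("%d\n", mergedCycles(4, 6, 3)); // 7: only 3 cycles of overlap are credited
        return 0;
    }
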
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h new file mode 100644 index 0000000..e4ffc63 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -0,0 +1,269 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../ARM.h" +#include "../ARMJIT.h" + +#include "../dolphin/Arm64Emitter.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMJIT_RegisterCache.h" + +#include <unordered_map> + +namespace ARMJIT +{ + +const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27; +const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28; +const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29; + +struct Op2 +{ + Op2() + {} + + Op2(Arm64Gen::ARM64Reg rm) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = Arm64Gen::ST_LSL; + Reg.ShiftAmount = 0; + } + + Op2(u32 imm) : IsImm(true), Imm(imm) + {} + + Op2(Arm64Gen::ARM64Reg rm, Arm64Gen::ShiftType st, int amount) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = st; + Reg.ShiftAmount = amount; + } + + Arm64Gen::ArithOption ToArithOption() + { + assert(!IsImm); + return Arm64Gen::ArithOption(Reg.Rm, Reg.ShiftType, Reg.ShiftAmount); + } + + bool IsSimpleReg() + { return !IsImm && !Reg.ShiftAmount && Reg.ShiftType == Arm64Gen::ST_LSL; } + bool ImmFits12Bit() + { return IsImm && (Imm & 0xFFF == Imm); } + bool IsZero() + { return IsImm && !Imm; } + + bool IsImm; + union + { + struct + { + Arm64Gen::ARM64Reg Rm; + Arm64Gen::ShiftType ShiftType; + int ShiftAmount; + } Reg; + u32 Imm; + }; +}; + +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter +{ +public: + typedef void (Compiler::*CompileFunc)(); + + Compiler(); + ~Compiler(); + + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + + Arm64Gen::ARM64Reg MapReg(int reg) + { + assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); + return RegCache.Mapping[reg]; + } + + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + + bool CanCompile(bool thumb, u16 kind); + + bool FlagsNZNeeded() + { + return CurInstr.SetFlags & 0xC; + } + + void Reset(); + + void Comp_AddCycles_C(bool forceNonConstant = false); + void Comp_AddCycles_CI(u32 numI); + void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); + void Comp_AddCycles_CD(); + void Comp_AddCycles_CDI(); + + void MovePC(); + + void LoadReg(int reg, Arm64Gen::ARM64Reg nativeReg); + void SaveReg(int reg, Arm64Gen::ARM64Reg nativeReg); + + void LoadCPSR(); + void SaveCPSR(bool markClean = true); + + void LoadCycles(); + void SaveCycles(); + + void Nop() {} + + void A_Comp_ALUTriOp(); + void A_Comp_ALUMovOp(); + void A_Comp_ALUCmpOp(); + + void A_Comp_Mul(); + void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); + + void A_Comp_Clz(); + + void A_Comp_MemWB(); + void A_Comp_MemHD(); + + void A_Comp_LDM_STM(); + + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + + void A_Comp_MRS(); + void A_Comp_MSR(); + + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALUImm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); + void T_Comp_AddSP(); + void T_Comp_RelAddr(); + + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + + void T_Comp_LDMIA_STMIA(); + void T_Comp_PUSH_POP(); + + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(); + + 
s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + + void Comp_Mul_Mla(bool S, bool mla, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rm, Arm64Gen::ARM64Reg rs, Arm64Gen::ARM64Reg rn); + + void Comp_Compare(int op, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Logical(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Arithmetic(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + + void Comp_RetriveFlags(bool retriveCV); + + Arm64Gen::FixupBranch CheckCondition(u32 cond); + + void Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); + + void A_Comp_GetOp2(bool S, Op2& op2); + + void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); + void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); + + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); + + // 0 = switch mode, 1 = stay arm, 2 = stay thumb + void* Gen_JumpTo9(int kind); + void* Gen_JumpTo7(int kind); + + void Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; + + bool Exit; + + FetchedInstr CurInstr; + bool Thumb; + u32 R15; + u32 Num; + ARM* CurCPU; + u32 ConstantCycles; + u32 CodeRegion; + + BitSet32 SavedRegs; + + u32 JitMemSecondarySize; + u32 JitMemMainSize; + + void* ReadBanked, *WriteBanked; + + void* JumpToFuncs9[3]; + void* JumpToFuncs7[3]; + + std::unordered_map<ptrdiff_t, LoadStorePatch> LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + + RegisterCache<Compiler, Arm64Gen::ARM64Reg> RegCache; + + bool CPSRDirty = false; + + bool IrregularCycles = false; + +#ifdef __SWITCH__ + void* JitRWBase; + void* JitRWStart; + void* JitRXStart; +#endif +}; + +} + +#endif
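
The Op2 struct declared in this header folds the ARM data-processing second operand into either an immediate or a register with a constant shift, which the ALU and load/store emitters then lower case by case. A minimal standalone sketch of the same idea follows; the names are illustrative only, and the 12-bit immediate check (AArch64 ADD/SUB immediates) is written out with explicit masking.

    #include <cassert>
    #include <cstdint>

    enum class Shift { LSL, LSR, ASR, ROR };

    struct Operand2
    {
        bool isImm;
        uint32_t imm;       // valid when isImm
        int reg;            // valid when !isImm
        Shift shiftType;
        int shiftAmount;

        static Operand2 Imm(uint32_t v) { return {true, v, 0, Shift::LSL, 0}; }
        static Operand2 Reg(int r, Shift s = Shift::LSL, int amount = 0)
        {
            return {false, 0, r, s, amount};
        }

        // does the immediate fit an AArch64 ADD/SUB 12-bit immediate field?
        bool fitsAdd12() const { return isImm && (imm & 0xFFFu) == imm; }
    };

    int main()
    {
        assert(Operand2::Imm(0x123).fitsAdd12());
        assert(!Operand2::Imm(0x1234).fitsAdd12());
        assert(!Operand2::Reg(3, Shift::ROR, 8).fitsAdd12());
        return 0;
    }
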
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18
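
ARM_Dispatch and ARM_Ret above form the boundary between C++ and generated code: x0 carries the CPU object, x1 the compiled block entry, and the callee-saved registers that cache RCycles/RCPSR are spilled and restored around the call. The following is a loose, self-contained C++ model of that dispatch pattern, offered purely as an analogy (the real entry points are the assembly stubs above, and the names here are made up).

    #include <cstdio>

    struct CpuState { int cycles; unsigned cpsr; };

    using BlockEntry = void (*)(CpuState&);

    // stands in for ARM_Dispatch: hand the CPU state to a compiled block
    // (the real stub loads RCycles/RCPSR and does "br x1")
    static void dispatch(CpuState& cpu, BlockEntry entry)
    {
        entry(cpu);
    }

    // stands in for a compiled block: consume cycles, then "return" the way
    // a real block would by jumping to ARM_Ret
    static void someCompiledBlock(CpuState& cpu)
    {
        cpu.cycles -= 3;
    }

    int main()
    {
        CpuState cpu{100, 0x1F};
        dispatch(cpu, someCompiledBlock);
        printf("%d\n", cpu.cycles); // 97
        return 0;
    }
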
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..c1b23a7 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -0,0 +1,794 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +#include "../ARMJIT_Memory.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +bool Compiler::IsJITFault(u64 pc) +{ + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); +} + +s64 Compiler::RewriteMemAccess(u64 pc) +{ + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); + + auto it = LoadStorePatches.find(pcOffset); + + if (it != LoadStorePatches.end()) + { + LoadStorePatch patch = it->second; + + ptrdiff_t curCodeOffset = GetCodeOffset(); + + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); + + BL(patch.PatchFunc); + + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); + + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); + + SetCodePtrUnsafe(curCodeOffset); + + LoadStorePatches.erase(it); + + return patch.PatchOffset; + } + printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); +} + +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 localAddr = LocaliseCodeAddress(Num, addr); + + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) + { + InvalidLiterals.Remove(invalidLiteralIdx); + return false; + } + + Comp_AddCycles_CDI(); + + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOVI2R(MapReg(rd), val); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + return true; +} + +void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } + + if (flags & memop_Store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); + + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; + } + + ARM64Reg finalAddr = W0; + if (flags & memop_Post) + { + finalAddr = rnMapped; + MOV(W0, rnMapped); + } + + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); + + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might has become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) + { + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } + + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } + + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); + + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); + + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); + if (flags & memop_Store) + { + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); + } + else + { + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? 
X1 : X0, X7); + if (size == 32) + { + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); + } + } + + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); + + if (func) + { + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } + } + else + { + if (Num == 0) + { + MOV(X1, RCPU); + if (flags & memop_Store) + { + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + } + } + else + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + } + } + } + else + { + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + } + else + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead7<u8>); break; + } + } + } + + if (!(flags & memop_Store)) + { + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } + } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } +} + +void Compiler::A_Comp_MemWB() +{ + Op2 offset; + if (CurInstr.Instr & (1 << 25)) + offset = Op2(MapReg(CurInstr.A_Reg(0)), (ShiftType)((CurInstr.Instr >> 5) & 0x3), (CurInstr.Instr >> 7) & 0x1F); + else + offset = Op2(CurInstr.Instr & 0xFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, byte ? 8 : 32, flags); +} + +void Compiler::A_Comp_MemHD() +{ + bool load = CurInstr.Instr & (1 << 20); + bool signExtend; + int op = (CurInstr.Instr >> 5) & 0x3; + int size; + + if (load) + { + signExtend = op >= 2; + size = op == 2 ? 
8 : 16; + } + else + { + size = 16; + signExtend = false; + } + + Op2 offset; + if (CurInstr.Instr & (1 << 22)) + offset = Op2((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0)); + else + offset = Op2(MapReg(CurInstr.A_Reg(0))); + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), + Op2(MapReg(CurInstr.T_Reg(6))), byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(MapReg(CurInstr.T_Reg(6))), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; + + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 0 : memop_Store); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + IrregularCycles = true; + + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) + { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + int expectedTarget = Num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; + + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + if (RegCache.LoadedRegs & (1 << reg)) + MOV(W3, MapReg(reg)); + else + LoadReg(reg, W3); + MOVI2R(W1, reg - 8); + BL(ReadBanked); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else + LoadReg(reg, W3); + + if (RegCache.LoadedRegs & (1 << *nextReg)) + second = MapReg(*nextReg); + else + LoadReg(*nextReg, W4); + + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); + + i++; + it++; + } + else if (RegCache.LoadedRegs & (1 << reg)) + { + STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } + else + { + LoadReg(reg, W3); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + i++; + it++; + } + } + + ADD(X1, SP, 0); + MOVI2R(W2, regsCount); + + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break; + case 1: 
QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break; + } + } + + if (!store) + { + if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && !regs[15] && reg >= 8 && reg < 15) + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + MOVI2R(W1, reg - 8); + BL(WriteBanked); + FixupBranch alreadyWritten = CBNZ(W4); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(MapReg(reg), W3); + else + SaveReg(reg, W3); + SetJumpTarget(alreadyWritten); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + if (RegCache.LoadedRegs & (1 << *nextReg)) + second = MapReg(*nextReg); + + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); + + if (first == W3) + SaveReg(reg, W3); + if (second == W4) + SaveReg(*nextReg, W4); + + it++; + i++; + } + else if (RegCache.LoadedRegs & (1 << reg)) + { + ARM64Reg mapped = MapReg(reg); + LDR(INDEX_UNSIGNED, mapped, SP, i * 8); + } + else + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + SaveReg(reg, W3); + } + + it++; + i++; + } + } + ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + + if (!store && regs[15]) + { + ARM64Reg mapped = MapReg(15); + Comp_JumpTo(mapped, Num == 0, usermode); + } + + return regsCount * 4 * (decrement ? -1 : 1); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? 
(!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + { + if (offset > 0) + ADD(rn, rn, offset); + else + SUB(rn, rn, -offset); + } +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + ARM64Reg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + if (offset > 0) + ADD(sp, sp, offset); + else + SUB(sp, sp, -offset); +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + ARM64Reg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + u32 regsCount = regs.Count(); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + { + if (offset > 0) + ADD(rb, rb, offset); + else + SUB(rb, rb, -offset); + } +} + +}
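Note on the LDM/STM writeback logic in A_Comp_LDM_STM above: on the ARM9 side (Num == 0), base writeback with the base register in the list is only kept when the base is either the sole register in the list or not the last (highest-numbered) one; the ARM7 path drops writeback entirely in that case. A minimal standalone sketch of that predicate using plain integer masks instead of BitSet16 (the helper name and test values are illustrative, not part of the source):

#include <cstdint>
#include <cstdio>

// Mirrors the BitSet16 expression used for the ARM9 path above: writeback is kept
// if rn is the only listed register, or if some register above rn is also listed.
static bool LDMWritesBackBase(uint16_t regs, int rn)
{
    bool rnIsOnlyReg = (regs & ~(1u << rn)) == 0;        // no other bits set
    bool regsAboveRn = (regs & ~((2u << rn) - 1)) != 0;  // any register above rn
    return rnIsOnlyReg || regsAboveRn;
}

int main()
{
    printf("%d\n", LDMWritesBackBase(0x0002, 1)); // ldmia r1!, {r1}       -> 1
    printf("%d\n", LDMWritesBackBase(0x000E, 1)); // ldmia r1!, {r1,r2,r3} -> 1
    printf("%d\n", LDMWritesBackBase(0x0003, 1)); // ldmia r1!, {r0,r1}    -> 0
}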
\ No newline at end of file diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +}
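ARMJIT_Compiler.h is a thin compile-time dispatch header: code that needs the compiler includes it and talks to the single global JITCompiler instance without caring whether the x64 or ARM64 backend was selected. A hedged usage sketch (the wrapper function is made up for illustration; IsJITFault/RewriteMemAccess are assumed from their use in ARMJIT_Memory.cpp further down):

#include "ARMJIT_Compiler.h"
#include "types.h"

// Illustrative helper, not part of the source: resolve a host fault PC against
// whichever backend compiler is active.
static s32 ResolveJitFault(u64 faultPC)
{
    if (!ARMJIT::JITCompiler->IsJITFault(faultPC))
        return 0;
    // Patch the faulting access over to the slow path and return the PC adjustment,
    // mirroring what ARMJIT_Memory::FaultHandler does below.
    return ARMJIT::JITCompiler->RewriteMemAccess(faultPC);
}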
\ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h new file mode 100644 index 0000000..c87e1b3 --- /dev/null +++ b/src/ARMJIT_Internal.h @@ -0,0 +1,227 @@ +#ifndef ARMJIT_INTERNAL_H +#define ARMJIT_INTERNAL_H + +#include "types.h" +#include <stdint.h> +#include <string.h> +#include <assert.h> + +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 Addr; + + u8 DataCycles; + u16 CodeCycles; + u32 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template <typename T> +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u16 Length = 0; + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(T) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void SetLength(u16 length) + { + if (Capacity < length) + MakeCapacity(length); + + Length = length; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) + { + Num = num; + NumAddresses = numAddresses; + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); + } + + u32 StartAddr; + u32 StartAddrLocal; + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; + + JitBlockEntry EntryPoint; + + u32* AddressRanges() + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } + +private: + TinyVector<u32> Data; +}; + +// size should be 16 bytes because I'm to lazy to use mul and whatnot +struct __attribute__((packed)) AddressRange +{ + TinyVector<JitBlock*> Blocks; + u32 Code; +}; + + +typedef void (*InterpreterFunc)(ARM* cpu); +extern 
InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +extern TinyVector<u32> InvalidLiterals; + +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; + +inline bool PageContainsCode(AddressRange* range) +{ + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); + +template <u32 Num> +void LinkBlock(ARM* cpu, u32 codeOffset); + +template <typename T, int ConsoleType> T SlowRead9(u32 addr, ARMv5* cpu); +template <typename T, int ConsoleType> void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template <typename T, int ConsoleType> T SlowRead7(u32 addr); +template <typename T, int ConsoleType> void SlowWrite7(u32 addr, T val); + +template <bool Write, int ConsoleType> void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template <bool Write, int ConsoleType> void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + +} + +#endif
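A small usage sketch of the TinyVector container declared above (illustrative only, assuming the usual u32 typedef from types.h): it is meant for short POD-only lists, and Remove()/RemoveByValue() swap the last element into the removed slot, so removal is O(1) but element order is not preserved.

#include <cstdio>
#include "ARMJIT_Internal.h"

void TinyVectorExample()
{
    ARMJIT::TinyVector<u32> v;
    v.Add(10);
    v.Add(20);
    v.Add(30);

    v.RemoveByValue(10);        // 30 is moved into slot 0; contents become {30, 20}

    for (int i = 0; i < v.Length; i++)
        printf("%u\n", v[i]);   // prints 30 then 20

    v.Clear();                  // resets Length, keeps the allocation
}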
\ No newline at end of file diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..0276c65 --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,1072 @@ +#if defined(__SWITCH__) +#include "switch/compat_switch.h" +#elif defined(_WIN32) +#include <windows.h> +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "DSi.h" +#include "GPU.h" +#include "GPU3D.h" +#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include <malloc.h> + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and map the memory regions as they're structured on the DS + in it. + + On most systems you have a single piece of main ram, + maybe some video ram and faster cache RAM and that's about it. + Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is 16kb SWRAM which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform specific code. + +*/ + +namespace ARMJIT_Memory +{ +struct FaultDescription +{ + u32 EmulatedFaultAddr; + u64 FaultPC; +}; + +bool FaultHandler(FaultDescription* faultDesc, s32& offset); +} + +#if defined(__SWITCH__) +// with LTO the symbols seem to be not properly overriden +// if they're somewhere else + +extern "C" +{ + +void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = ctx->cpu_gprs[0].w; + desc.FaultPC = ctx->pc.x; + + u64 integerRegisters[33]; + memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); + integerRegisters[29] = ctx->fp.x; + integerRegisters[30] = ctx->lr.x; + integerRegisters[31] = ctx->sp.x; + integerRegisters[32] = ctx->pc.x; + + s32 offset; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + integerRegisters[32] += offset; + + ARM_RestoreContext(integerRegisters); + } + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} + +#elif defined(_WIN32) + +static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) +{ + if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) + return EXCEPTION_CONTINUE_SEARCH; + + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = exceptionInfo->ContextRecord->Rcx; + desc.FaultPC = exceptionInfo->ContextRecord->Rip; + + s32 offset = 0; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + exceptionInfo->ContextRecord->Rip += offset; + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + +#endif + +namespace ARMJIT_Memory +{ + +void* FastMem9Start, *FastMem7Start; + +#ifdef _WIN32 +inline u32 RoundUp(u32 size) +{ + return (size + 0xFFFF) & ~0xFFFF; +} +#else +inline u32 RoundUp(u32 size) +{ + return size; +} +#endif + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = RoundUp(NDS::MainRAMMaxSize); +const 
u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(NDS::SharedWRAMSize); +const u32 MemBlockDTCMOffset = MemBlockARM7WRAMOffset + RoundUp(NDS::ARM7WRAMSize); +const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(DSi::NWRAMSize); +const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(DSi::NWRAMSize); +const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(DSi::NWRAMSize); + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockNWRAM_AOffset, + MemBlockNWRAM_BOffset, + MemBlockNWRAM_COffset +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#if defined(__SWITCH__) +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#elif defined(_WIN32) +u8* MemoryBase; +HANDLE MemoryFile; +LPVOID ExceptionHandlerHandle; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#elif defined(_WIN32) + bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; + return r; +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + return R_SUCCEEDED(r); +#else + return UnmapViewOfFile(dst); +#endif +} + +void SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#if defined(_WIN32) + DWORD winProtection, oldProtection; + if (protection == 0) + winProtection = PAGE_NOACCESS; + else if (protection == 1) + winProtection = PAGE_READONLY; + else + winProtection = PAGE_READWRITE; + VirtualProtect(dst, size, winProtection, &oldProtection); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? 
MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + +#ifdef __SWITCH__ + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } +#endif + } + } +#if defined(_WIN32) + UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); +#endif + } +}; +ARMJIT::TinyVector<Mapping> Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (offset < mapping.LocalOffset || offset >= mapping.LocalOffset + mapping.Size) + continue; + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; + +#if defined(__SWITCH__) + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); +#elif defined(_WIN32) + SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 
1 : 2); +#endif + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then map the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCBEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCBEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd < start); + bool newOverlap = newSize > 0 && !(newBase >= end || newEnd < start); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapNWRAM(int num) +{ + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) + { + Mapping& mapping = Mappings[memregion_SharedWRAM][i]; + if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size + || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + { + mapping.Unmap(memregion_SharedWRAM); + Mappings[memregion_SharedWRAM].Remove(i); + } + else + { + i++; + } + } + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + num].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + num][i].Unmap(memregion_NewSharedWRAM_A + num); + } + Mappings[memregion_NewSharedWRAM_A + num].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length; i++) + { + Mappings[memregion_SharedWRAM][i].Unmap(memregion_SharedWRAM); + } + Mappings[memregion_SharedWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); + for (int j = 0; j < 3; j++) + { + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + j].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + j][i].Unmap(memregion_NewSharedWRAM_A + j); + } + Mappings[memregion_NewSharedWRAM_A + j].Clear(); + } +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsFastmemCompatible(region)) + return false; + + return false; + + u32 mirrorStart, mirrorSize, memoryOffset; + bool isMapped = GetMirrorLocation(region, num, addr, memoryOffset, mirrorStart, mirrorSize); + if (!isMapped) + return false; + + u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %x, %x %d %d\n", mirrorStart, mirrorSize, region, num); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + +#if defined(_WIN32) + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); +#endif + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < mirrorSize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < mirrorSize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + +#if defined(__SWITCH__) + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeded); + } +#elif defined(_WIN32) + if (hasCode) + { + SetCodeProtectionRange(mirrorStart + offset, sectionSize, num, 1); + } +#endif + } + } + + assert(num == 0 || num == 1); + Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); + + return true; +} + +bool FaultHandler(FaultDescription* faultDesc, s32& offset) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->EmulatedFaultAddr; + + if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr); + + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC); + } + return true; + } + return false; +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeded); + + // 8 GB of address space, just don't ask... 
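    // Each emulated CPU gets its own full 4 GiB (0x100000000) reservation below,
    // i.e. one complete 32-bit guest address space per core, so the fast path can
    // form a host pointer as FastMemXStart + guestAddr with no range check.
    // Touching a page that is not mapped raises a fault, which the handler above
    // resolves (typically by rewriting the access to the slow-path helpers).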
+ FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + u8* basePtr = MemoryBaseCodeMem; +#elif defined(_WIN32) + ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler); + + MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL); + + MemoryBase = (u8*)VirtualAlloc(NULL, MemoryTotalSize, MEM_RESERVE, PAGE_READWRITE); + + FastMem9Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + FastMem7Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + + // only free them after they have all been reserved + // so they can't overlap + VirtualFree(MemoryBase, 0, MEM_RELEASE); + VirtualFree(FastMem9Start, 0, MEM_RELEASE); + VirtualFree(FastMem7Start, 0, MEM_RELEASE); + + MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase); + + u8* basePtr = MemoryBase; +#endif + NDS::MainRAM = basePtr + MemBlockMainRAMOffset; + NDS::SharedWRAM = basePtr + MemBlockSWRAMOffset; + NDS::ARM7WRAM = basePtr + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = basePtr + MemBlockDTCMOffset; + DSi::NWRAM_A = basePtr + MemBlockNWRAM_AOffset; + DSi::NWRAM_B = basePtr + MemBlockNWRAM_BOffset; + DSi::NWRAM_C = basePtr + MemBlockNWRAM_COffset; +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#elif defined(_WIN32) + assert(UnmapViewOfFile(MemoryBase)); + CloseHandle(MemoryFile); + + RemoveVectoredExceptionHandler(ExceptionHandlerHandle); +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsFastmemCompatible(int region) +{ +#ifdef _WIN32 + /* + TODO: with some hacks, the smaller shared WRAM regions + could be mapped in some occaisons as well + */ + if (region == memregion_DTCM + || region == memregion_SharedWRAM + || region == memregion_NewSharedWRAM_B + || region == memregion_NewSharedWRAM_C) + return false; +#endif + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mirrorStart = addr & ~(ITCMPhysicalSize - 1); + mirrorSize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_MainRAM: + mirrorStart = addr & ~NDS::MainRAMMask; + mirrorSize = NDS::MainRAMMask + 1; + return true; + case memregion_BIOS9: + if (num == 0) + { + mirrorStart = addr & ~0xFFF; + mirrorSize = 0x1000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mirrorStart = 0; + mirrorSize = 0x4000; + return true; + } + return false; + case memregion_SharedWRAM: + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mirrorStart = addr & ~NDS::SWRAM_ARM9.Mask; + mirrorSize = NDS::SWRAM_ARM9.Mask + 1; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + return true; + } + 
else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mirrorStart = addr & ~NDS::SWRAM_ARM7.Mask; + mirrorSize = NDS::SWRAM_ARM7.Mask + 1; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + mirrorStart = addr & ~(NDS::ARM7WRAMSize - 1); + mirrorSize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + mirrorStart = addr & ~0xFFFFF; + mirrorSize = 0x100000; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mirrorStart = addr & ~0x3FFFF; + mirrorSize = 0x40000; + return true; + } + return false; + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_A; + mirrorStart = addr & ~0xFFFF; + mirrorSize = 0x10000; + return true; + } + return false; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_B; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; + } + return false; // zero filled memory + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_C; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; + } + return false; // zero filled memory + } + case memregion_BIOS9DSi: + if (num == 0) + { + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<0) ? 0x8000 : 0x10000; + return true; + } + return false; + case memregion_BIOS7DSi: + if (num == 1) + { + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<8) ? 0x8000 : 0x10000; + return true; + } + return false; + default: + assert(false && "For the time being this should only be used for code"); + return false; + } +} + +u32 LocaliseAddress(int region, u32 num, u32 addr) +{ + switch (region) + { + case memregion_ITCM: + return (addr & (ITCMPhysicalSize - 1)) | (memregion_ITCM << 27); + case memregion_MainRAM: + return (addr & NDS::MainRAMMask) | (memregion_MainRAM << 27); + case memregion_BIOS9: + return (addr & 0xFFF) | (memregion_BIOS9 << 27); + case memregion_BIOS7: + return (addr & 0x3FFF) | (memregion_BIOS7 << 27); + case memregion_SharedWRAM: + if (num == 0) + return ((addr & NDS::SWRAM_ARM9.Mask) + (NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + else + return ((addr & NDS::SWRAM_ARM7.Mask) + (NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + case memregion_WRAM7: + return (addr & (NDS::ARM7WRAMSize - 1)) | (memregion_WRAM7 << 27); + case memregion_VRAM: + // TODO: take mapping properly into account + return (addr & 0xFFFFF) | (memregion_VRAM << 27); + case memregion_VWRAM: + // same here + return (addr & 0x3FFFF) | (memregion_VWRAM << 27); + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + return (ptr - DSi::NWRAM_A + (addr & 0xFFFF)) | (memregion_NewSharedWRAM_A << 27); + else + return memregion_Other << 27; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + return (ptr - DSi::NWRAM_B + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_B << 27); + else + return memregion_Other << 27; + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & 
DSi::NWRAMMask[num][2]]; + if (ptr) + return (ptr - DSi::NWRAM_C + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_C << 27); + else + return memregion_Other << 27; + } + case memregion_BIOS9DSi: + case memregion_BIOS7DSi: + return (addr & 0xFFFF) | (region << 27); + default: + assert(false && "This should only be needed for regions which can contain code"); + return memregion_Other << 27; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + { + return memregion_ITCM; + } + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + { + return memregion_DTCM; + } + else + { + if (NDS::ConsoleType == 1 && addr >= 0xFFFF0000 && !(DSi::SCFG_BIOS & (1<<1))) + { + if ((addr >= 0xFFFF8000) && (DSi::SCFG_BIOS & (1<<0))) + return memregion_Other; + + return memregion_BIOS9DSi; + } + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + { + return memregion_BIOS9; + } + + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[0][0] && addr < DSi::NWRAMEnd[0][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[0][1] && addr < DSi::NWRAMEnd[0][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[0][2] && addr < DSi::NWRAMEnd[0][2]) + return memregion_NewSharedWRAM_C; + } + + if (NDS::SWRAM_ARM9.Mem) + return memregion_SharedWRAM; + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + default: + return memregion_Other; + } + } +} + +int ClassifyAddress7(u32 addr) +{ + if (NDS::ConsoleType == 1 && addr < 0x00010000 && !(DSi::SCFG_BIOS & (1<<9))) + { + if (addr >= 0x00008000 && DSi::SCFG_BIOS & (1<<8)) + return memregion_Other; + + return memregion_BIOS7DSi; + } + else if (addr < 0x00004000) + { + return memregion_BIOS7; + } + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[1][0] && addr < DSi::NWRAMEnd[1][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[1][1] && addr < DSi::NWRAMEnd[1][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[1][2] && addr < DSi::NWRAMEnd[1][2]) + return memregion_NewSharedWRAM_C; + } + + if (NDS::SWRAM_ARM7.Mem) + return memregion_SharedWRAM; + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG<T>(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG<T>(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ<T>(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ<T>(addr, val); return; + default: GPU::WriteVRAM_LCDC<T>(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG<T>(addr); + case 0x00200000: return GPU::ReadVRAM_BBG<T>(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ<T>(addr); + case 0x00600000: 
return GPU::ReadVRAM_BOBJ<T>(addr); + default: return GPU::ReadVRAM_LCDC<T>(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM9IORead8; + case 9: return (void*)DSi::ARM9IOWrite8; + case 16: return (void*)DSi::ARM9IORead16; + case 17: return (void*)DSi::ARM9IOWrite16; + case 32: return (void*)DSi::ARM9IORead32; + case 33: return (void*)DSi::ARM9IOWrite32; + } + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM7IORead8; + case 9: return (void*)DSi::ARM7IOWrite8; + case 16: return (void*)DSi::ARM7IORead16; + case 17: return (void*)DSi::ARM7IOWrite16; + case 32: return (void*)DSi::ARM7IORead32; + case 33: return (void*)DSi::ARM7IOWrite32; + } + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; + case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; + case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; + case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; + case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; + case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; + } + } + } + return NULL; +} + +}
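Minimal illustration of the `size | store` trick used by GetFuncForAddr above: size is always one of 8/16/32 (all even) and the store flag only contributes bit 0, so read handlers land on the even case labels and write handlers on the odd ones. The helper below is illustrative, not part of the source:

#include <cstdio>

static const char* DescribeAccess(int size, bool store)
{
    switch (size | store)
    {
    case 8:  return "read8";
    case 9:  return "write8";
    case 16: return "read16";
    case 17: return "write16";
    case 32: return "read32";
    case 33: return "write32";
    default: return "invalid";
    }
}

int main()
{
    printf("%s\n", DescribeAccess(16, true));  // write16
    printf("%s\n", DescribeAccess(32, false)); // read32
}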
\ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..123e18e --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,63 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SharedWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + + // DSi + memregion_BIOS9DSi, + memregion_BIOS7DSi, + memregion_NewSharedWRAM_A, + memregion_NewSharedWRAM_B, + memregion_NewSharedWRAM_C, + + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize); +u32 LocaliseAddress(int region, u32 num, u32 addr); + +bool IsFastmemCompatible(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); +void RemapNWRAM(int num); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif
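Sketch of how the value returned by ARMJIT_Memory::LocaliseAddress() is laid out, based on the implementation above: the memregion_* id sits in the top bits (region << 27) and the mirror-independent offset in the low 27 bits. The two helpers below are illustrative, not part of the source:

#include "ARMJIT_Memory.h"

inline int LocalisedRegion(u32 localAddr)
{
    return localAddr >> 27;
}

inline u32 LocalisedOffset(u32 localAddr)
{
    return localAddr & ((1u << 27) - 1);
}

// e.g. classifying and localising an ARM9 main-RAM address:
//   int region = ARMJIT_Memory::ClassifyAddress9(0x02001234);
//   u32 local  = ARMJIT_Memory::LocaliseAddress(region, 0, 0x02001234);
//   LocalisedRegion(local) == ARMJIT_Memory::memregion_MainRAM
//   LocalisedOffset(local) == 0x1234 (assuming the default 4 MB main-RAM mirror mask)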
\ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h new file mode 100644 index 0000000..0547c84 --- /dev/null +++ b/src/ARMJIT_RegisterCache.h @@ -0,0 +1,199 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include <assert.h> + +namespace ARMJIT +{ + +template <typename T, typename Reg> +class RegisterCache +{ +public: + RegisterCache() + {} + + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount, bool pcAllocatableAsSrc = false) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + + PCAllocatableAsSrc = ~(pcAllocatableAsSrc + ? 0 + : (1 << 15)); + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->SaveReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg, bool loadValue) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + if (loadValue) + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + LiteralsLoaded = 0; + } + + void Prepare(bool thumb, int i) + { + FetchedInstr instr = Instrs[i]; + + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + // we don't need to load a value which is always going to be overwritten + BitSet16 
needValueLoaded(needToBeLoaded); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); + for (int reg : needToBeLoaded) + LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } + } + + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + u16 PCAllocatableAsSrc = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..43b94b6 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -0,0 +1,768 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +// uses RSCRATCH3 +void Compiler::Comp_ArithTriOp(void (Compiler::*op)(int, const OpArg&, const OpArg&), + OpArg rd, OpArg rn, OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (rd == rn && !(opFlags & opInvertOp2)) + (this->*op)(32, rd, op2); + else if (opFlags & opSymmetric && op2 == R(RSCRATCH)) + { + if (opFlags & opInvertOp2) + NOT(32, op2); + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + } + else + { + if (opFlags & opInvertOp2) + { + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + NOT(32, op2); + } + MOV(32, R(RSCRATCH3), rn); + (this->*op)(32, R(RSCRATCH3), op2); + MOV(32, rd, R(RSCRATCH3)); + } + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) +{ + switch (op) + { + case 0: // TST + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; + } + + Comp_RetriveFlags(op == 2, op >= 2, carryUsed); +} + +// also calculates cycles +OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + carryUsed = false; + return Imm32(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + S = S && (CurInstr.SetFlags & 
0x2); + + int op = (CurInstr.Instr >> 5) & 0x3; + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + if (rm.IsImm() && CurInstr.A_Reg(0) == 15) + rm = Imm32(rm.Imm32() + 4); + return Comp_RegShiftReg(op, MapReg(CurInstr.A_Reg(8)), rm, S, carryUsed); + } + else + { + Comp_AddCycles_C(); + return Comp_RegShiftImm(op, (CurInstr.Instr >> 7) & 0x1F, + MapReg(CurInstr.A_Reg(0)), S, carryUsed); + } + } +} + +void Compiler::A_Comp_CmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); + + Comp_CmpOp(op - 0x8, rn, op2, carryUsed); +} + +void Compiler::A_Comp_Arith() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); + + u32 sFlag = S ? opSetsFlags : 0; + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + break; + case 0x1: // EOR + Comp_ArithTriOp(&Compiler::XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + break; + case 0x2: // SUB + Comp_ArithTriOp(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + break; + case 0x3: // RSB + if (op2.IsZero()) + { + if (rd != rn) + MOV(32, rd, rn); + NEG(32, rd); + if (S) + Comp_RetriveFlags(true, true, false); + } + else + Comp_ArithTriOpReverse(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + break; + case 0x4: // ADD + Comp_ArithTriOp(&Compiler::ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + break; + case 0x5: // ADC + Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + break; + case 0x6: // SBC + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + break; + case 0x7: // RSC + Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + break; + case 0xC: // ORR + Comp_ArithTriOp(&Compiler::OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + break; + case 0xE: // BIC + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + break; + default: + assert("unimplemented"); + } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); +} + +void Compiler::A_Comp_MovOp() +{ + bool carryUsed; + bool S = CurInstr.Instr & (1 << 20); + OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (rd != op2) + MOV(32, rd, op2); + + if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { + NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); + + if (S) + { + if (FlagsNZRequired()) + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); + } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); +} + +void Compiler::A_Comp_CLZ() +{ + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + + MOV(32, R(RSCRATCH), Imm32(32)); + TEST(32, rm, rm); + FixupBranch skipZero = J_CC(CC_Z); + BSR(32, RSCRATCH, rm); + XOR(32, R(RSCRATCH), Imm8(0x1F)); // 31 - RSCRATCH + SetJumpTarget(skipZero); + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::Comp_MulOp(bool 
S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn) +{ + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, add ? 2 : 1); + } + + static_assert(EAX == RSCRATCH, "Someone changed RSCRATCH!"); + MOV(32, R(RSCRATCH), rm); + if (add) + { + IMUL(32, RSCRATCH, rs); + LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); + if (S && FlagsNZRequired()) + TEST(32, rd, rd); + } + else + { + IMUL(32, RSCRATCH, rs); + MOV(32, rd, R(RSCRATCH)); + if (S && FlagsNZRequired()) + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + + if (S) + Comp_RetriveFlags(false, false, false); +} + +void Compiler::A_Comp_MUL_MLA() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn; + if (add) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_MulOp(S, add, rd, rm, rs, rn); +} + +void Compiler::A_Comp_Mul_Long() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn = MapReg(CurInstr.A_Reg(12)); + + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + if (sign) + { + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + } + else + { + BSR(32, RSCRATCH, R(RSCRATCH3)); + } + + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, 2); + } + + if (sign) + { + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + } + else + { + MOV(32, R(RSCRATCH2), rm); + MOV(32, R(RSCRATCH3), rs); + } + if (add) + { + MOV(32, R(RSCRATCH), rd); + SHL(64, R(RSCRATCH), Imm8(32)); + OR(64, R(RSCRATCH), rn); + + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + ADD(64, R(RSCRATCH2), R(RSCRATCH)); + } + else + { + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + if (S && FlagsNZRequired()) + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + } + + if (S) + Comp_RetriveFlags(false, false, false); + + MOV(32, rn, R(RSCRATCH2)); + SHR(64, R(RSCRATCH2), Imm8(32)); + MOV(32, rd, R(RSCRATCH2)); +} + +void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) +{ + if (CurInstr.SetFlags == 0) + return; + if (retriveCV && !(CurInstr.SetFlags & 0x3)) + retriveCV = false; + + bool carryOnly = !retriveCV && carryUsed; + if (carryOnly && !(CurInstr.SetFlags & 0x2)) + { + carryUsed = false; + carryOnly = false; + } + + CPSRDirty = true; + + if (retriveCV) + { + SETcc(CC_O, R(RSCRATCH)); + SETcc(sign ? 
CC_NC : CC_C, R(RSCRATCH3)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); + } + + if (FlagsNZRequired()) + { + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else if (carryUsed || retriveCV) + { + SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); + AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH2)); + } +} + +// always uses RSCRATCH, RSCRATCH2 only if S == true +OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = S; + + if (S) + { + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); + } + + MOV(32, R(RSCRATCH), rm); + static_assert(RSCRATCH3 == ECX, "Someone changed RSCRATCH3"); + MOV(32, R(ECX), rs); + AND(32, R(ECX), Imm32(0xFF)); + + FixupBranch zero = J_CC(CC_Z); + if (op < 3) + { + void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; + if (op == 0) + shiftOp = &Compiler::SHL; + else if (op == 1) + shiftOp = &Compiler::SHR; + else if (op == 2) + shiftOp = &Compiler::SAR; + + CMP(32, R(ECX), Imm8(32)); + FixupBranch lt32 = J_CC(CC_L); + FixupBranch done1; + if (op < 2) + { + FixupBranch eq32 = J_CC(CC_E); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + if (S) + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + done1 = J(); + SetJumpTarget(eq32); + } + (this->*shiftOp)(32, R(RSCRATCH), Imm8(31)); + (this->*shiftOp)(32, R(RSCRATCH), Imm8(1)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + FixupBranch done2 = J(); + + SetJumpTarget(lt32); + (this->*shiftOp)(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + if (op < 2) + SetJumpTarget(done1); + SetJumpTarget(done2); + + } + else if (op == 3) + { + if (S) + BT(32, R(RSCRATCH), Imm8(31)); + ROR_(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + } + SetJumpTarget(zero); + + return R(RSCRATCH); +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? 
amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + + assert(false); +} + +void Compiler::T_Comp_ShiftImm() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + Comp_AddCycles_C(); + + bool carryUsed; + OpArg shifted = Comp_RegShiftImm(op, amount, rs, true, carryUsed); + + if (shifted != rd) + MOV(32, rd, shifted); + + if (FlagsNZRequired()) + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); +} + +void Compiler::T_Comp_AddSub_() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 9) & 0x3; + + OpArg rn = op >= 2 ? Imm32((CurInstr.Instr >> 6) & 0x7) : MapReg(CurInstr.T_Reg(6)); + + Comp_AddCycles_C(); + + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) + Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + else + Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); +} + +void Compiler::T_Comp_ALU_Imm8() +{ + OpArg rd = MapReg(CurInstr.T_Reg(8)); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurInstr.Instr & 0xFF); + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + MOV(32, rd, imm); + if (FlagsNZRequired()) + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(&Compiler::ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(&Compiler::SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; + } +} + +void Compiler::T_Comp_MUL() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + Comp_MulOp(true, false, rd, rd, rs, Imm8(-1)); +} + +void Compiler::T_Comp_ALU() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + + u32 op = (CurInstr.Instr >> 6) & 0xF; + + if ((op >= 0x2 && op < 0x4) || op == 0x7) + Comp_AddCycles_CI(1); // shift by reg + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x1: // EOR + Comp_ArithTriOp(&Compiler::XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + int shiftOp = op == 0x7 ? 
3 : op - 0x2; + bool carryUsed; + OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); + if (FlagsNZRequired()) + TEST(32, shifted, shifted); + MOV(32, rd, shifted); + Comp_RetriveFlags(false, false, true); + } + return; + case 0x5: // ADC + Comp_ArithTriOp(&Compiler::ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + return; + case 0x6: // SBC + Comp_ArithTriOp(&Compiler::SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + return; + case 0x8: // TST + Comp_CmpOp(0, rd, rs, false); + return; + case 0x9: // NEG + if (rd != rs) + MOV(32, rd, rs); + NEG(32, rd); + Comp_RetriveFlags(true, true, false); + return; + case 0xA: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0xB: // CMN + Comp_CmpOp(3, rd, rs, false); + return; + case 0xC: // ORR + Comp_ArithTriOp(&Compiler::OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0xE: // BIC + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + return; + case 0xF: // MVN + if (rd != rs) + MOV(32, rd, rs); + NOT(32, rd); + Comp_RetriveFlags(false, false, false); + return; + default: + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + OpArg rdMapped = MapReg(rd); + OpArg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // ADD + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric); + break; + case 0x1: // CMP + Comp_CmpOp(2, rdMapped, rs, false); + return; // this is on purpose + case 0x2: // MOV + if (rdMapped != rs) + MOV(32, rdMapped, rs); + break; + } + + if (rd == 15) + { + OR(32, rdMapped, Imm8(1)); + Comp_JumpTo(rdMapped.GetSimpleReg()); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + OpArg sp = MapReg(13); + OpArg offset = Imm32((CurInstr.Instr & 0x7F) << 2); + if (CurInstr.Instr & (1 << 7)) + SUB(32, sp, offset); + else + ADD(32, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + OpArg sp = MapReg(13); + LEA(32, rd.GetSimpleReg(), MDisp(sp.GetSimpleReg(), offset)); + } + else + MOV(32, rd, Imm32((R15 & ~2) + offset)); +} + +}
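
For reference, the NZCV packing that Comp_RetriveFlags assembles above with SETcc/LEA/SHL (on the path where N and Z are needed) can be restated as plain host-side C++. This is only an illustrative sketch of the value that ends up OR'd into RCPSR; the function and parameter names are made up here and are not part of the diff.

#include <cstdint>

// Value merged into the top bits of RCPSR by the emitted SETcc/LEA/SHL sequence.
uint32_t packFlagsForCPSR(bool n, bool z, bool c, bool v,
                          bool retrieveCV, bool carryOnly)
{
    uint32_t nz = (uint32_t(n) << 1) | uint32_t(z);                  // LEA(..., SCALE_2)
    if (retrieveCV)
        return ((nz << 2) | (uint32_t(c) << 1) | uint32_t(v)) << 28; // N,Z,C,V -> bits 31..28
    if (carryOnly)
        return ((nz << 1) | uint32_t(c)) << 29;                      // N,Z,C   -> bits 31..29
    return nz << 30;                                                 // N,Z     -> bits 31..30
}
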
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..bda9e52 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -0,0 +1,272 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +template <typename T> +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + OR(32, R(RCPSR), Imm8(0x20)); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + AND(32, R(RCPSR), Imm32(~0x20)); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); +} + +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + IrregularCycles = true; + + bool cpsrDirty = CPSRDirty; + SaveCPSR(); + + PushRegs(restoreCPSR); + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + if (!restoreCPSR) + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + else + MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); + + PopRegs(restoreCPSR); + + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; +} + +void 
Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOV(32, MapReg(14), Imm32(R15 - 4)); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + OpArg rn = MapReg(CurInstr.A_Reg(0)); + MOV(32, R(RSCRATCH), rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOV(32, MapReg(14), Imm32(R15 - 4)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_SpecialBranchBehaviour(true); + + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + + Comp_SpecialBranchBehaviour(false); + + Comp_AddCycles_C(true); + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(32, R(RSCRATCH), MapReg(CurInstr.A_Reg(3))); + MOV(32, MapReg(14), Imm32(R15 - 1)); + Comp_JumpTo(RSCRATCH); + } + else + { + OpArg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn.GetSimpleReg()); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOV(32, MapReg(14), Imm32(R15 + offset)); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + OpArg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + LEA(32, RSCRATCH, MDisp(lr.GetSimpleReg(), offset)); + MOV(32, lr, Imm32((R15 - 2) | 1)); + if (Num == 1 || CurInstr.Instr & (1 << 12)) + OR(32, R(RSCRATCH), Imm8(1)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); + + Comp_JumpTo(target); +} + +}
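
The target computation in A_Comp_BranchImm boils down to the following host-side arithmetic. This is a sketch under the assumption that r15 already holds PC + 8, as it does at compile time in this file; the names are illustrative and the code is not part of the diff.

#include <cstdint>

uint32_t armBranchTarget(uint32_t instr, uint32_t r15 /* PC + 8 */)
{
    int32_t offset = (int32_t)(instr << 8) >> 6;      // sign-extend imm24, times 4
    uint32_t target = r15 + offset;
    if (((instr >> 28) & 0xF) == 0xF)                 // cond == 0xF: BLX_imm
        target += (((instr >> 24) & 1) << 1) | 1;     // H bit as a halfword, plus Thumb bit
    return target;
}
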
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..d8bdd56 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -0,0 +1,899 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" +#include "../Config.h" + +#include <assert.h> + +#include "../dolphin/CommonFuncs.h" + +#ifdef _WIN32 +#include <windows.h> +#else +#include <sys/mman.h> +#include <unistd.h> +#endif + +using namespace Gen; + +extern "C" void ARM_Ret(); + +namespace ARMJIT +{ +template <> +const X64Reg RegisterCache<Compiler, X64Reg>::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13, R14, // callee saved + R10, R11, // caller saved +#else + RBX, R12, R13, R14, // callee saved, this is sad + R9, R10, R11, // caller saved +#endif +}; +template <> +const int RegisterCache<Compiler, X64Reg>::NativeRegsAvailable = +#ifdef _WIN32 + 8 +#else + 7 +#endif +; + +#ifdef _WIN32 +const BitSet32 CallerSavedPushRegs({R10, R11}); +#else +const BitSet32 CallerSavedPushRegs({R9, R10, R11}); +#endif + +void Compiler::PushRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + + if (saveHiRegs) + { + BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + { + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.UnloadRegister(reg); + else + SaveReg(reg, RegCache.Mapping[reg]); + // prevent saving the register twice + loadedRegs[reg] = false; + } + } + + for (int reg : loadedRegs) + if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + SaveReg(reg, RegCache.Mapping[reg]); +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + for (int reg : loadedRegs) + { + if ((saveHiRegs && reg >= 8 && reg < 15) + || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + { + LoadReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); + CALL(ReadBanked); + MOV(32, rd, R(RSCRATCH3)); + } + else + MOV(32, rd, R(RCPSR)); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + OpArg val = CurInstr.Instr & (1 << 25) + ? 
Imm32(ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))) + : MapReg(CurInstr.A_Reg(0)); + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); + CALL(ReadBanked); + + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + AND(32, R(RSCRATCH4), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH4), CC_E); + + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + NOT(32, R(RSCRATCH4)); + AND(32, R(RSCRATCH3), R(RSCRATCH4)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RSCRATCH3), R(RSCRATCH2)); + + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); + CALL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + AND(32, R(RCPSR), Imm32(~mask)); + if (!val.IsImm()) + { + MOV(32, R(RSCRATCH), val); + AND(32, R(RSCRATCH), Imm32(mask)); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + OR(32, R(RCPSR), Imm32(val.Imm32() & mask)); + } + } + else + { + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + AND(32, R(RSCRATCH3), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_E); + + MOV(32, R(RSCRATCH3), R(RCPSR)); + + // I need you ANDN + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(RCPSR), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RCPSR), R(RSCRATCH2)); + + PushRegs(true); + + MOV(32, R(ABI_PARAM3), R(RCPSR)); + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +/* + We'll repurpose this .bss memory + + */ +u8 CodeMemory[1024 * 1024 * 32]; + +Compiler::Compiler() +{ + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + ResetStart = pageAligned; + CodeMemSize = alignedSize; + } + + Reset(); + + { + // RSCRATCH mode + // RSCRATCH2 reg number + // RSCRATCH3 value in current mode + // ret - RSCRATCH3 + ReadBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ))); + RET(); + SetJumpTarget(irq); + 
MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ))); + RET(); + SetJumpTarget(svc); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC))); + RET(); + SetJumpTarget(abt); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT))); + RET(); + SetJumpTarget(und); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND))); + RET(); + } + { + // RSCRATCH mode + // RSCRATCH2 reg n + // RSCRATCH3 value + // carry flag set if the register isn't banked + WriteBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + STC(); + RET(); + + SetJumpTarget(fiq); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(irq); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(svc); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(abt); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(und); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)), R(RSCRATCH3)); + CLC(); + RET(); + } + + for (int consoleType = 0; consoleType < 2; consoleType++) + { + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 16; reg++) + { + if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2 || reg == ABI_PARAM3) + { + PatchedStoreFuncs[consoleType][num][size][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL; + continue; + } + + X64Reg rdMapped = (X64Reg)reg; + PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + MOV(32, R(ABI_PARAM3), R(rdMapped)); + } + else + { + MOV(32, R(ABI_PARAM2), R(rdMapped)); + } + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9<u32, 0>); break; + case 33: ABI_CallFunction(SlowWrite7<u32, 0>); break; + case 16: ABI_CallFunction(SlowWrite9<u16, 0>); break; + case 17: ABI_CallFunction(SlowWrite7<u16, 0>); break; + case 8: ABI_CallFunction(SlowWrite9<u8, 0>); break; + case 9: ABI_CallFunction(SlowWrite7<u8, 0>); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9<u32, 1>); break; + case 33: ABI_CallFunction(SlowWrite7<u32, 1>); break; + case 16: ABI_CallFunction(SlowWrite9<u16, 1>); break; + case 17: ABI_CallFunction(SlowWrite7<u16, 1>); break; + case 8: ABI_CallFunction(SlowWrite9<u8, 1>); break; + case 9: ABI_CallFunction(SlowWrite7<u8, 1>); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr(); + if 
(RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + MOV(64, R(ABI_PARAM2), R(RCPU)); + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9<u32, 0>); break; + case 33: ABI_CallFunction(SlowRead7<u32, 0>); break; + case 16: ABI_CallFunction(SlowRead9<u16, 0>); break; + case 17: ABI_CallFunction(SlowRead7<u16, 0>); break; + case 8: ABI_CallFunction(SlowRead9<u8, 0>); break; + case 9: ABI_CallFunction(SlowRead7<u8, 0>); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9<u32, 1>); break; + case 33: ABI_CallFunction(SlowRead7<u32, 1>); break; + case 16: ABI_CallFunction(SlowRead9<u16, 1>); break; + case 17: ABI_CallFunction(SlowRead7<u16, 1>); break; + case 8: ABI_CallFunction(SlowRead9<u8, 1>); break; + case 9: ABI_CallFunction(SlowRead7<u8, 1>); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (signextend) + MOVSX(32, 8 << size, rdMapped, R(RSCRATCH)); + else + MOVZX(32, 8 << size, rdMapped, R(RSCRATCH)); + RET(); + } + } + } + } + } + + // move the region forward to prevent overwriting the generated functions + CodeMemSize -= GetWritableCodePtr() - ResetStart; + ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + + MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); +} + +void Compiler::SaveCPSR(bool flagClean) +{ + if (CPSRDirty) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); + if (flagClean) + CPSRDirty = false; + } +} + +void Compiler::LoadReg(int reg, X64Reg nativeReg) +{ + if (reg != 15) + MOV(32, R(nativeReg), MDisp(RCPU, offsetof(ARM, R[reg]))); + else + MOV(32, R(nativeReg), Imm32(R15)); +} + +void Compiler::SaveReg(int reg, X64Reg nativeReg) +{ + MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); +} + +// invalidates RSCRATCH and RSCRATCH3 +Gen::FixupBranch Compiler::CheckCondition(u32 cond) +{ + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + return J_CC(CC_Z, ldmStm); + } + else + { + // could have used a LUT, but then where would be the fun? + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); + + return J_CC(cond & 1 ? 
CC_NZ : CC_Z, ldmStm); + } +} + +#define F(x) &Compiler::x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // EOR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SUB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADD + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SBC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ORR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MOV + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // BIC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), 
F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MVN + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // TST + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // TEQ + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMP + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMN + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // Mul + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + F(A_Comp_CLZ), NULL, NULL, NULL, NULL, + // STR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSB + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // swap + NULL, NULL, + // LDM/STM + F(A_Comp_LDM_STM), F(A_Comp_LDM_STM), + // Branch + F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), + // system stuff + NULL, F(A_Comp_MSR), F(A_Comp_MSR), F(A_Comp_MRS), NULL, NULL, NULL, + F(Nop) +}; + +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + // Shift imm + F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), + // Three operand ADD/SUB + F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), + // 8 bit imm + F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), + // general ALU + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU), + // hi reg + F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), + // pc/sp relative + F(T_Comp_RelAddr), 
F(T_Comp_RelAddr), F(T_Comp_AddSP), + // LDR pcrel + F(T_Comp_LoadPCRel), + // LDR/STR reg offset + F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), + // LDR/STR sign extended, half + F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), + // LDR/STR imm offset + F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), + // LDR/STR half imm offset + F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf), + // LDR/STR sp rel + F(T_Comp_MemSPRel), F(T_Comp_MemSPRel), + // PUSH/POP + F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), + // LDMIA, STMIA + F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), + // Branch + F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), + // Unk, SVC + NULL, NULL, + F(T_Comp_BL_Merged) +}; +#undef F + +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + +void Compiler::Reset() +{ + memset(ResetStart, 0xcc, CodeMemSize); + SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; + + LoadStorePatches.clear(); +} + +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + +void Compiler::Comp_SpecialBranchBehaviour(bool taken) +{ + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) + { + RegCache.PrepareExit(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + JMP((u8*)&ARM_Ret, true); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +{ + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); + ResetBlockCache(); + } + + ConstantCycles = 0; + Thumb = thumb; + Num = cpu->Num; + CodeRegion = instrs[0].Addr >> 24; + CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = false; + + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); + + RegCache = RegisterCache<Compiler, X64Reg>(this, instrs, instrsCount); + + for (int i = 0; i < instrsCount; i++) + { + CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; + + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); + if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + + SaveCPSR(); + } + } + + if (comp != NULL) + RegCache.Prepare(Thumb, i); + else + RegCache.Flush(); + + if (Thumb) + { + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + { + Comp_AddCycles_C(); + } + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + + Comp_SpecialBranchBehaviour(true); + + if (CurInstr.Cond() < 0xE) + { + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) + { + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(true); + + Comp_SpecialBranchBehaviour(false); + + SetJumpTarget(skipFailed); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + LoadCPSR(); + } + + RegCache.Flush(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + JMP((u8*)ARM_Ret, true); + + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + + return res; +} + +void Compiler::Comp_AddCycles_C(bool forceNonConstant) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +void Compiler::Comp_AddCycles_CI(u32 i) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; + + if (!Thumb && CurInstr.Cond() < 0xE) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + { + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + } + else + { + ConstantCycles += cycles; + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + } +} + +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 4) == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +}
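
The main-RAM cycle merging used by Comp_AddCycles_CDI on the ARM7 side is compact enough to restate as a standalone function. The sketch below mirrors only that branch of the code above; the function and parameter names are assumptions, not identifiers from the diff.

#include <algorithm>

// Overlapping code/data accesses to main RAM are capped instead of summed.
int mergeCodeDataCycles7(int numC, int numD, bool codeInMainRAM, bool dataInMainRAM)
{
    if (dataInMainRAM)
    {
        if (codeInMainRAM)
            return numC + numD;
        numC++;
        return std::max(numC + numD - 3, std::max(numC, numD));
    }
    if (codeInMainRAM)
    {
        numD++;
        return std::max(numC + numD - 3, std::max(numC, numD));
    }
    return numC + numD + 1;
}
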
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h new file mode 100644 index 0000000..0fe0147 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -0,0 +1,255 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../dolphin/x64Emitter.h" + +#include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" +#include "../ARMJIT_RegisterCache.h" + +#include <unordered_map> + +namespace ARMJIT +{ + +const Gen::X64Reg RCPU = Gen::RBP; +const Gen::X64Reg RCPSR = Gen::R15; + +const Gen::X64Reg RSCRATCH = Gen::EAX; +const Gen::X64Reg RSCRATCH2 = Gen::EDX; +const Gen::X64Reg RSCRATCH3 = Gen::ECX; +const Gen::X64Reg RSCRATCH4 = Gen::R8; + +struct LoadStorePatch +{ + void* PatchFunc; + s16 Offset; + u16 Size; +}; + +struct Op2 +{ + Op2() + {} + + Op2(u32 imm) + : IsImm(true), Imm(imm) + {} + Op2(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; + +class Compiler : public Gen::XEmitter +{ +public: + Compiler(); + + void Reset(); + + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + + void LoadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); + + bool CanCompile(bool thumb, u16 kind); + + typedef void (Compiler::*CompileFunc)(); + + void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); + + void Comp_AddCycles_C(bool forceNonConstant = false); + void Comp_AddCycles_CI(u32 i); + void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); + + enum + { + opSetsFlags = 1 << 0, + opSymmetric = 1 << 1, + opRetriveCV = 1 << 2, + opInvertCarry = 1 << 3, + opSyncCarry = 1 << 4, + opInvertOp2 = 1 << 5, + }; + + void Nop() {} + + void A_Comp_Arith(); + void A_Comp_MovOp(); + void A_Comp_CmpOp(); + + void A_Comp_MUL_MLA(); + void A_Comp_Mul_Long(); + + void A_Comp_CLZ(); + + void A_Comp_MemWB(); + void A_Comp_MemHalf(); + void A_Comp_LDM_STM(); + + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + + void A_Comp_MRS(); + void A_Comp_MSR(); + + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALU_Imm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); + void T_Comp_MUL(); + + void T_Comp_RelAddr(); + void T_Comp_AddSP(); + + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + void T_Comp_PUSH_POP(); + void T_Comp_LDMIA_STMIA(); + + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(); + + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags); + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, 
Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + + void Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn); + + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + + void Comp_SpecialBranchBehaviour(bool taken); + + + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); + Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); + + Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + + void LoadCPSR(); + void SaveCPSR(bool flagClean = true); + + bool FlagsNZRequired() + { return CurInstr.SetFlags & 0xC; } + + Gen::FixupBranch CheckCondition(u32 cond); + + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + + Gen::OpArg MapReg(int reg) + { + if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) + return Gen::Imm32(R15); + + assert(RegCache.Mapping[reg] != Gen::INVALID_REG); + return Gen::R(RegCache.Mapping[reg]); + } + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + + void* PatchedStoreFuncs[2][2][3][16]; + void* PatchedLoadFuncs[2][2][3][2][16]; + + std::unordered_map<u8*, LoadStorePatch> LoadStorePatches; + + u8* ResetStart; + u32 CodeMemSize; + + bool Exit; + bool IrregularCycles; + + void* ReadBanked; + void* WriteBanked; + + bool CPSRDirty = false; + + FetchedInstr CurInstr; + + RegisterCache<Compiler, Gen::X64Reg> RegCache; + + bool Thumb; + u32 Num; + u32 R15; + u32 CodeRegion; + + u32 ConstantCycles; + + ARM* CurCPU; +}; + +} + +#endif
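
The Op2 helper declared in this header carries either a rotated immediate or a (register, shift op, amount) triple, and Comp_MemAccess consumes it together with the memop_* flags. The call sites below are hypothetical, shown only to illustrate the interface; they do not appear in the diff and assume a fully initialised Compiler.

#include "ARMJIT_Compiler.h"

using namespace ARMJIT;

void exampleOperands(Compiler& c)
{
    Op2 immOffset(0x10);        // e.g. LDR r0, [r1, #0x10]
    Op2 regOffset(2, 0, 3);     // e.g. LDR r0, [r1, r2, LSL #3] (reg 2, op 0 = LSL, amount 3)

    // flags == 0 selects a pre-indexed load with an added offset, no writeback:
    c.Comp_MemAccess(0, 1, immOffset, 32, 0);
    c.Comp_MemAccess(0, 1, regOffset, 32, 0);
}
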
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +}
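
For clarity, the writeOffset macro above expands to one fprintf per field, so the generated ARMJIT_Offsets.h is just a list of #define lines that the hand-written dispatcher below includes. The expansion for CPSR looks like this; the emitted hex value is whatever offsetof reports for the build, so it is not shown here.

// Expansion of writeOffset(CPSR):
fprintf(f, "#define ARM_" "CPSR" "_offset 0x%x\n", offsetof(ARM, CPSR));
// Generated line (offset value depends on the ARM struct layout of the build):
//   #define ARM_CPSR_offset 0x<offsetof(ARM, CPSR)>
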
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..0a84df0 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,78 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx +#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#else + sub rsp, 0x8 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#else + add rsp, 0x8 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..2da113b --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,773 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +using namespace Gen; + +namespace ARMJIT +{ + +template <typename T> +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +s32 Compiler::RewriteMemAccess(u64 pc) +{ + auto it = LoadStorePatches.find((u8*)pc); + if (it != LoadStorePatches.end()) + { + LoadStorePatch patch = it->second; + LoadStorePatches.erase(it); + + u8* curCodePtr = GetWritableCodePtr(); + u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset; + SetCodePtr(rewritePtr); + + CALL(patch.PatchFunc); + u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr); + if (remainingSize > 0) + NOP(remainingSize); + + //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size); + + SetCodePtr(curCodePtr); + + return patch.Offset; + } + + printf("this is a JIT bug %x\n", pc); + abort(); +} + +/* + According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) + of all memory load and store instructions always access addresses in the same region as + during the their first execution. + + I tried multiple optimisations, which would benefit from this behaviour + (having fast paths for the first region, …), though none of them yielded a measureable + improvement. 
+*/ + +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 localAddr = LocaliseCodeAddress(Num, addr); + + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) + { + InvalidLiterals.Remove(invalidLiteralIdx); + return false; + } + + Comp_AddCycles_CDI(); + + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOV(32, MapReg(rd), Imm32(val)); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + return true; +} + + +void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } + + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } + + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + OpArg rdMapped = MapReg(rd); + + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) + { + MOV(32, R(RSCRATCH3), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + u8* memopStart = GetWritableCodePtr(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? 
PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] + : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; + + assert(patch.PatchFunc != NULL); + + MOV(64, R(RSCRATCH), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + + X64Reg maskedAddr = RSCRATCH3; + if (size > 8) + { + maskedAddr = RSCRATCH2; + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + AND(32, R(RSCRATCH2), Imm8(addressMask)); + } + + u8* memopLoadStoreLocation = GetWritableCodePtr(); + if (flags & memop_Store) + { + MOV(size, MRegSum(RSCRATCH, maskedAddr), rdMapped); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + + if (size == 32) + { + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); + } + } + + patch.Offset = memopStart - memopLoadStoreLocation; + patch.Size = GetWritableCodePtr() - memopStart; + + assert(patch.Size >= 5); + + LoadStorePatches[memopLoadStoreLocation] = patch; + } + else + { + PushRegs(false); + + if (Num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowWrite9<u32, 0>); break; + case 16: CALL((void*)&SlowWrite9<u16, 0>); break; + case 8: CALL((void*)&SlowWrite9<u8, 0>); break; + case 33: CALL((void*)&SlowWrite9<u32, 1>); break; + case 17: CALL((void*)&SlowWrite9<u16, 1>); break; + case 9: CALL((void*)&SlowWrite9<u8, 1>); break; + } + } + else + { + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowRead9<u32, 0>); break; + case 16: CALL((void*)&SlowRead9<u16, 0>); break; + case 8: CALL((void*)&SlowRead9<u8, 0>); break; + case 33: CALL((void*)&SlowRead9<u32, 1>); break; + case 17: CALL((void*)&SlowRead9<u16, 1>); break; + case 9: CALL((void*)&SlowRead9<u8, 1>); break; + } + } + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); + + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowWrite7<u32, 0>); break; + case 16: CALL((void*)&SlowWrite7<u16, 0>); break; + case 8: CALL((void*)&SlowWrite7<u8, 0>); break; + case 33: CALL((void*)&SlowWrite7<u32, 1>); break; + case 17: CALL((void*)&SlowWrite7<u16, 1>); break; + case 9: CALL((void*)&SlowWrite7<u8, 1>); break; + } + } + else + { + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowRead7<u32, 0>); break; + case 16: CALL((void*)&SlowRead7<u16, 0>); break; + case 8: CALL((void*)&SlowRead7<u8, 0>); break; + case 33: CALL((void*)&SlowRead7<u32, 1>); break; + case 17: CALL((void*)&SlowRead7<u16, 1>); break; + case 9: CALL((void*)&SlowRead7<u8, 1>); break; + } + } + } + + PopRegs(false); + + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! 
LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + { + if (Thumb) + OR(32, rdMapped, Imm8(0x1)); + else + AND(32, rdMapped, Imm8(0xFE)); + } + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) + { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement && preinc) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); + + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + if (!store) + Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); + + bool compileFastPath = Config::JIT_FastMemory + && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = (((regsCount + 4 + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#else + u32 stackAlloc = (((regsCount + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; + + if (decrement) + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4 + (preinc ? 0 : 4))); + else + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(preinc ? 4 : 0)); + + if (compileFastPath) + { + AND(32, R(RSCRATCH4), Imm8(~3)); + + u8* fastPathStart = GetWritableCodePtr(); + u8* firstLoadStoreAddr; + + bool firstLoadStore = true; + + MOV(64, R(RSCRATCH2), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + ADD(64, R(RSCRATCH2), R(RSCRATCH4)); + MOV(32, R(RSCRATCH3), R(RSCRATCH4)); + + u32 offset = 0; + for (int reg : regs) + { + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + + OpArg mem = MDisp(RSCRATCH2, offset); + if (store) + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, mem, MapReg(reg)); + } + else + { + LoadReg(reg, RSCRATCH); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); + } + else + { + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); + } + } + offset += 4; + + firstLoadStore = false; + } + + LoadStorePatch patch; + patch.Size = GetWritableCodePtr() - fastPathStart; + patch.Offset = fastPathStart - firstLoadStoreAddr; + SwitchToFarCode(); + patch.PatchFunc = GetWritableCodePtr(); + + LoadStorePatches[firstLoadStoreAddr] = patch; + } + + if (!store) + { + PushRegs(false); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + if (allocOffset == 0) + MOV(64, R(ABI_PARAM2), R(RSP)); + else + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); + + switch (Num * 2 | NDS::ConsoleType) + { + case 0: CALL((void*)&SlowBlockTransfer9<false, 0>); break; + case 1: CALL((void*)&SlowBlockTransfer9<false, 1>); break; + case 2: CALL((void*)&SlowBlockTransfer7<false, 0>); break; + case 3: CALL((void*)&SlowBlockTransfer7<false, 1>); break; + } + + PopRegs(false); + + if (allocOffset) + ADD(64, R(RSP), Imm8(allocOffset)); + + bool firstUserMode = true; + for (int reg : regs) + { + if (usermode && !regs[15] && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + POP(RSCRATCH3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); + else + SaveReg(reg, RSCRATCH3); + SetJumpTarget(sucessfulWritten); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); + } + else + { + POP(MapReg(reg).GetSimpleReg()); + } + } + } + else + { + bool firstUserMode = true; + for (int reg = 15; reg >= 0; reg--) + { + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, RSCRATCH3); + else + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(RSCRATCH3); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + { + PUSH(MapReg(reg).GetSimpleReg()); + } + } + } + + if (allocOffset) + SUB(64, R(RSP), Imm8(allocOffset)); + + PushRegs(false); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + if (allocOffset) + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + else + MOV(64, R(ABI_PARAM2), R(RSP)); + + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); + + switch (Num * 2 | NDS::ConsoleType) + { + case 0: CALL((void*)&SlowBlockTransfer9<true, 0>); break; + case 1: CALL((void*)&SlowBlockTransfer9<true, 1>); break; + case 2: CALL((void*)&SlowBlockTransfer7<true, 0>); break; + case 3: CALL((void*)&SlowBlockTransfer7<true, 1>); break; + } + + ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); + + PopRegs(false); + } + + if (compileFastPath) + { + RET(); + SwitchToNearCode(); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + + return offset; +} + + +void Compiler::A_Comp_MemWB() +{ + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Op2 offset; + if (!(CurInstr.Instr & (1 << 25))) + { + offset = Op2(CurInstr.Instr & 0xFFF); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); + + offset = Op2(rm, op, amount); + } + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::A_Comp_MemHalf() +{ + Op2 offset = CurInstr.Instr & (1 << 22) + ? Op2(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : Op2(CurInstr.A_Reg(0), 0, 0); + + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); + + bool signExtend = false; + int size; + if (!load) + { + size = op == 1 ? 16 : 32; + load = op == 2; + } + else if (load) + { + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } + + if (size == 32 && Num == 1) + return; // NOP + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + OpArg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset)); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 
0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + OpArg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + ADD(32, sp, Imm8(offset)); // offset will always be in range since PUSH accesses 9 regs max +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + OpArg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + ADD(32, rb, Imm8(offset)); +} + +}
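The slow-path helpers above are picked with a small dispatch trick: the access size (8, 16 or 32) and NDS::ConsoleType (0 for DS, 1 for DSi) are OR'd into one switch key, so case 33 means a 32-bit DSi access and case 9 an 8-bit DSi access. A minimal stand-alone sketch of that idea, not part of the patch (the function name is made up for illustration):

// Sketch: mirrors the `size | ConsoleType` switch used by the JIT's slow memory path.
// size is always 8, 16 or 32 and consoleType is 0 or 1, so the two never collide in the low bits.
static const char* DescribeAccess(int size, int consoleType)
{
    switch (size | consoleType)
    {
    case 32: return "32-bit DS access";
    case 33: return "32-bit DSi access";
    case 16: return "16-bit DS access";
    case 17: return "16-bit DSi access";
    case 8:  return "8-bit DS access";
    case 9:  return "8-bit DSi access";
    default: return "invalid";
    }
}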
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp new file mode 100644 index 0000000..ccec951 --- /dev/null +++ b/src/ARM_InstrInfo.cpp @@ -0,0 +1,539 @@ +#include "ARM_InstrInfo.h" + +#include <stdio.h> + +#include "Config.h" + +namespace ARMInstrInfo +{ + +#define ak(x) ((x) << 22) + +enum { + A_Read0 = 1 << 0, + A_Read16 = 1 << 1, + A_Read8 = 1 << 2, + A_Read12 = 1 << 3, + + A_Write12 = 1 << 4, + A_Write16 = 1 << 5, + A_MemWriteback = 1 << 6, + + A_BranchAlways = 1 << 7, + + // for STRD/LDRD + A_Read12Double = 1 << 8, + A_Write12Double = 1 << 9, + + A_Link = 1 << 10, + + A_UnkOnARM7 = 1 << 11, + + A_SetNZ = 1 << 12, + A_SetCV = 1 << 13, + A_SetMaybeC = 1 << 14, + A_MulFlags = 1 << 15, + A_ReadC = 1 << 16, + A_RRXReadC = 1 << 17, + A_StaticShiftSetC = 1 << 18, + A_SetC = 1 << 19, + + A_WriteMem = 1 << 20, + A_LoadMem = 1 << 21 +}; + +#define A_BIOP A_Read16 +#define A_MONOOP 0 + +#define A_ARITH_LSL_IMM A_SetCV +#define A_LOGIC_LSL_IMM A_StaticShiftSetC +#define A_ARITH_SHIFT_IMM A_SetCV +#define A_LOGIC_SHIFT_IMM A_SetC +#define A_ARITH_SHIFT_REG A_SetCV +#define A_LOGIC_SHIFT_REG A_SetMaybeC +#define A_ARITH_IMM A_SetCV +#define A_LOGIC_IMM 0 + +#define A_IMPLEMENT_ALU_OP(x,k,a,c) \ + const u32 A_##x##_IMM = A_Write12 | c | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ + \ + const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a##_LSL_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | 
A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(SUB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(RSB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADD,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(SBC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(RSC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(ORR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MOV,MONOOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(BIC,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) + +const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; + +#define A_IMPLEMENT_ALU_TEST(x,a) \ + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a##_IMM | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a##_LSL_IMM | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST,LOGIC) +A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) +A_IMPLEMENT_ALU_TEST(CMP,ARITH) +A_IMPLEMENT_ALU_TEST(CMN,ARITH) + +const u32 A_MUL = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); +const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); +const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); +const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); +const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); + +const u32 A_CLZ = A_Write12 | A_Read0 | A_UnkOnARM7 | ak(ak_CLZ); + +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); + +#define A_LDR A_Write12 | A_LoadMem +#define A_STR A_Read12 | A_WriteMem + +#define A_IMPLEMENT_WB_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ + const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | 
A_Read0 | ak(ak_##x##_REG_LSR); \ + const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ + const u32 A_##x##_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ + const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ + const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ + const u32 A_##x##_POST_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + +A_IMPLEMENT_WB_LDRSTR(STR,STR) +A_IMPLEMENT_WB_LDRSTR(STRB,STR) +A_IMPLEMENT_WB_LDRSTR(LDR,LDR) +A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) + +#define A_LDRD A_Write12Double | A_LoadMem +#define A_STRD A_Read12Double | A_WriteMem + +#define A_IMPLEMENT_HD_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG); \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG); + +A_IMPLEMENT_HD_LDRSTR(STRH,STR) +A_IMPLEMENT_HD_LDRSTR(LDRD,LDRD) +A_IMPLEMENT_HD_LDRSTR(STRD,STRD) +A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) + +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWPB); + +const u32 A_LDM = A_Read16 | A_MemWriteback | A_LoadMem | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); + +const u32 A_B = A_BranchAlways | ak(ak_B); +const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); +const u32 A_BLX_IMM = A_BranchAlways | A_Link | ak(ak_BLX_IMM); +const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); +const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); + +const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); +const u32 A_MSR_IMM = ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | ak(ak_MRC); +const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); + +// THUMB + +#define tk(x) ((x) << 22) + +enum { + T_Read0 = 1 << 0, + T_Read3 = 1 << 1, + T_Read6 = 1 << 2, + T_Read8 = 1 << 3, + + T_Write0 = 1 << 4, + T_Write8 = 1 << 5, + + T_ReadHi0 = 1 << 6, + T_ReadHi3 = 1 << 7, + T_WriteHi0 = 1 << 8, + + T_ReadR13 = 1 << 9, + T_WriteR13 = 1 << 10, + + T_BranchAlways = 1 << 12, + T_ReadR14 = 1 << 13, + T_WriteR14 = 1 << 14, + + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, + + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, +}; + +const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | 
T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Read8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_SetNZ | T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_SetNZ | T_Write0 | T_Read3 | tk(tk_MVN_REG); + +const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); +const u32 T_CMP_HIREG = T_SetNZ | T_SetCV | T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); + +const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); +const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); +const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); + +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); + +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); + +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); + +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); + +const u32 T_PUSH = 
T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); + +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); + +const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); +const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); +const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_B = T_BranchAlways | tk(tk_B); +const u32 T_BL_LONG_1 = T_WriteR14 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | tk(tk_BL_LONG_2); + +const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); + +#define INSTRFUNC_PROTO(x) u32 x +#include "ARM_InstrTable.h" +#undef INSTRFUNC_PROTO + +Info Decode(bool thumb, u32 num, u32 instr) +{ + const u8 FlagsReadPerCond[7] = { + flag_Z, + flag_C, + flag_N, + flag_V, + flag_C | flag_Z, + flag_N | flag_V, + flag_Z | flag_N | flag_V}; + + Info res = {0}; + if (thumb) + { + u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + res.Kind = (data >> 22) & 0x3F; + + if (data & T_Read0) + res.SrcRegs |= 1 << (instr & 0x7); + if (data & T_Read3) + res.SrcRegs |= 1 << ((instr >> 3) & 0x7); + if (data & T_Read6) + res.SrcRegs |= 1 << ((instr >> 6) & 0x7); + if (data & T_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_Write0) + res.DstRegs |= 1 << (instr & 0x7); + if (data & T_Write8) + res.DstRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_ReadHi0) + res.SrcRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + if (data & T_ReadHi3) + res.SrcRegs |= 1 << ((instr >> 3) & 0xF); + if (data & T_WriteHi0) + res.DstRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + + if (data & T_ReadR13) + res.SrcRegs |= (1 << 13); + if (data & T_WriteR13) + res.DstRegs |= (1 << 13); + if (data & T_WriteR14) + res.DstRegs |= (1 << 14); + if (data & T_ReadR14) + res.SrcRegs |= (1 << 14); + + if (data & T_BranchAlways) + res.DstRegs |= (1 << 15); + + if (res.Kind == tk_POP && instr & (1 << 8)) + res.DstRegs |= 1 << 15; + + if (data & T_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & T_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & T_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if (data & T_ReadC) + res.ReadFlags |= flag_C; + if (data & T_SetC) + res.WriteFlags |= flag_C; + + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + + if (data & T_LoadMem) + { + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } + } + + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); + res.SrcRegs |= set; + } + + res.EndBlock |= res.Branches(); + + if (res.Kind == tk_BCOND) + res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; + + return res; + } + else + { + u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; + if (num == 0 && (instr & 0xFE000000) == 0xFA000000) + data = A_BLX_IMM; + else if ((instr >> 28) == 0xF) + data = ak(ak_Nop); + + if (data & A_UnkOnARM7 && num == 1) + data = A_UNK; + + res.Kind = 
(data >> 22) & 0x1FF; + + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + + if (res.Kind == ak_MCR) + { + u32 cn = (instr >> 16) & 0xF; + u32 cm = instr & 0xF; + u32 cpinfo = (instr >> 5) & 0x7; + u32 id = (cn<<8)|(cm<<4)|cpinfo; + if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) + res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; + } + if (res.Kind == ak_MCR || res.Kind == ak_MRC) + { + u32 cp = ((instr >> 8) & 0xF); + if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) + { + data = A_UNK; + res.Kind = ak_UNK; + } + } + if (res.Kind == ak_MRS && !(instr & (1 << 22))) + res.ReadFlags |= flag_N | flag_Z | flag_C | flag_V; + if ((res.Kind == ak_MSR_IMM || res.Kind == ak_MSR_REG) && instr & (1 << 19)) + res.WriteFlags |= flag_N | flag_Z | flag_C | flag_V; + + if (data & A_Read0) + res.SrcRegs |= 1 << (instr & 0xF); + if (data & A_Read16) + res.SrcRegs |= 1 << ((instr >> 16) & 0xF); + if (data & A_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0xF); + if (data & A_Read12) + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + + if (data & A_Write12) + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + if (data & A_Write16) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_MemWriteback && instr & (1 << 21)) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_BranchAlways) + res.DstRegs |= 1 << 15; + + if (data & A_Read12Double) + { + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + res.SrcRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + if (data & A_Write12Double) + { + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + res.DstRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + + if (data & A_Link) + res.DstRegs |= 1 << 14; + + if (res.Kind == ak_LDM) + res.DstRegs |= instr & (1 << 15); // this is right + + if (res.Kind == ak_STM) + res.SrcRegs |= instr & (1 << 15); + + if (data & A_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & A_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if ((data & A_MulFlags) && (instr & (1 << 20))) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_ReadC) + res.ReadFlags |= flag_C; + if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) + res.ReadFlags |= flag_C; + if ((data & A_SetC) || ((data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F))) + res.WriteFlags |= flag_C; + + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + } + + if ((instr >> 28) < 0xE) + { + // make non conditional flag sets conditional + res.WriteFlags = (res.WriteFlags | (res.WriteFlags << 4)) & 0xF0; + res.ReadFlags |= FlagsReadPerCond[instr >> 29]; + } + + res.EndBlock |= res.Branches(); + + return res; + } +} + +} diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h new file mode 100644 index 0000000..a702435 --- /dev/null +++ b/src/ARM_InstrInfo.h @@ -0,0 +1,263 @@ +#ifndef ARMINSTRINFO_H +#define ARMINSTRINFO_H + +#include "types.h" + +namespace ARMInstrInfo +{ + +// Instruction kinds, for faster dispatch + +#define ak_ALU(n) 
\ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_REG_LSL_IMM_S, \ + ak_##n##_REG_LSR_IMM_S, \ + ak_##n##_REG_ASR_IMM_S, \ + ak_##n##_REG_ROR_IMM_S, \ + \ + ak_##n##_REG_LSL_REG_S, \ + ak_##n##_REG_LSR_REG_S, \ + ak_##n##_REG_ASR_REG_S, \ + ak_##n##_REG_ROR_REG_S, \ + \ + ak_##n##_IMM_S \ + +#define ak_Test(n) \ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM + +#define ak_WB_LDRSTR(n) \ + ak_##n##_REG_LSL, \ + ak_##n##_REG_LSR, \ + ak_##n##_REG_ASR, \ + ak_##n##_REG_ROR, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG_LSL, \ + ak_##n##_POST_REG_LSR, \ + ak_##n##_POST_REG_ASR, \ + ak_##n##_POST_REG_ROR, \ + \ + ak_##n##_POST_IMM + +#define ak_HD_LDRSTR(n) \ + ak_##n##_REG, \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG, \ + ak_##n##_POST_IMM + +enum +{ + ak_ALU(AND), + ak_ALU(EOR), + ak_ALU(SUB), + ak_ALU(RSB), + ak_ALU(ADD), + ak_ALU(ADC), + ak_ALU(SBC), + ak_ALU(RSC), + ak_ALU(ORR), + ak_ALU(MOV), + ak_ALU(BIC), + ak_ALU(MVN), + + ak_Test(TST), + ak_Test(TEQ), + ak_Test(CMP), + ak_Test(CMN), + + ak_MUL, + ak_MLA, + ak_UMULL, + ak_UMLAL, + ak_SMULL, + ak_SMLAL, + ak_SMLAxy, + ak_SMLAWy, + ak_SMULWy, + ak_SMLALxy, + ak_SMULxy, + + ak_CLZ, + + ak_QADD, + ak_QSUB, + ak_QDADD, + ak_QDSUB, + + ak_WB_LDRSTR(STR), + ak_WB_LDRSTR(STRB), + ak_WB_LDRSTR(LDR), + ak_WB_LDRSTR(LDRB), + + ak_HD_LDRSTR(STRH), + ak_HD_LDRSTR(LDRD), + ak_HD_LDRSTR(STRD), + ak_HD_LDRSTR(LDRH), + ak_HD_LDRSTR(LDRSB), + ak_HD_LDRSTR(LDRSH), + + ak_SWP, + ak_SWPB, + + ak_LDM, + ak_STM, + + ak_B, + ak_BL, + ak_BLX_IMM, + ak_BX, + ak_BLX_REG, + + ak_UNK, + ak_MSR_IMM, + ak_MSR_REG, + ak_MRS, + ak_MCR, + ak_MRC, + ak_SVC, + + ak_Nop, + + ak_Count, + + tk_LSL_IMM = 0, + tk_LSR_IMM, + tk_ASR_IMM, + + tk_ADD_REG_, + tk_SUB_REG_, + tk_ADD_IMM_, + tk_SUB_IMM_, + + tk_MOV_IMM, + tk_CMP_IMM, + tk_ADD_IMM, + tk_SUB_IMM, + + tk_AND_REG, + tk_EOR_REG, + tk_LSL_REG, + tk_LSR_REG, + tk_ASR_REG, + tk_ADC_REG, + tk_SBC_REG, + tk_ROR_REG, + tk_TST_REG, + tk_NEG_REG, + tk_CMP_REG, + tk_CMN_REG, + tk_ORR_REG, + tk_MUL_REG, + tk_BIC_REG, + tk_MVN_REG, + + tk_ADD_HIREG, + tk_CMP_HIREG, + tk_MOV_HIREG, + + tk_ADD_PCREL, + tk_ADD_SPREL, + tk_ADD_SP, + + tk_LDR_PCREL, + tk_STR_REG, + tk_STRB_REG, + tk_LDR_REG, + tk_LDRB_REG, + tk_STRH_REG, + tk_LDRSB_REG, + tk_LDRH_REG, + tk_LDRSH_REG, + tk_STR_IMM, + tk_LDR_IMM, + tk_STRB_IMM, + tk_LDRB_IMM, + tk_STRH_IMM, + tk_LDRH_IMM, + tk_STR_SPREL, + tk_LDR_SPREL, + + tk_PUSH, + tk_POP, + tk_LDMIA, + tk_STMIA, + + tk_BCOND, + tk_BX, + tk_BLX_REG, + tk_B, + tk_BL_LONG_1, + tk_BL_LONG_2, + tk_UNK, + tk_SVC, + + // not a real instruction + tk_BL_LONG, + + tk_Count +}; + +enum +{ + flag_N = 1 << 3, + flag_Z = 1 << 2, + flag_C = 1 << 1, + flag_V = 1 << 0, +}; + +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_LoadMem, + special_WaitForInterrupt, + special_LoadLiteral +}; + +struct Info +{ + u16 DstRegs, SrcRegs, NotStrictlyNeeded; + u16 Kind; + + u8 SpecialKind; + + u8 ReadFlags; + // lower 4 bits - set always + // upper 4 bits - might set flag + u8 WriteFlags; + + bool EndBlock; + bool Branches() const + { + return DstRegs & (1 << 15); + } +}; + +Info Decode(bool thumb, u32 num, u32 instr); + +} + +#endif
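The Info record above is what the block compiler consumes for register-liveness and flag analysis. As a rough usage sketch (hypothetical caller, not code from this commit, and assuming the instruction table maps the encoding below to the plain MOV register kind): decoding the ARM instruction MOV R0, R1 (0xE1A00001) on the ARM9 should report r1 as a source, r0 as a destination and no branch, so the block can keep going.

#include "ARM_InstrInfo.h"

// Hypothetical caller, for illustration only.
bool DecodeExample()
{
    // thumb = false, num = 0 (ARM9), instr = 0xE1A00001 (MOV R0, R1)
    ARMInstrInfo::Info info = ARMInstrInfo::Decode(false, 0, 0xE1A00001);

    bool readsR1  = info.SrcRegs & (1 << 1);   // expected: true
    bool writesR0 = info.DstRegs & (1 << 0);   // expected: true
    bool branches = info.Branches();           // expected: false, PC is not written
    return readsR1 && writesR0 && !branches;
}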
\ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 32fcac2..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,7 @@ project(core) +set (CMAKE_CXX_STANDARD 14) + add_library(core STATIC ARCodeList.cpp AREngine.cpp @@ -44,10 +46,53 @@ add_library(core STATIC version.h Wifi.cpp WifiAP.cpp - + tiny-AES-c/aes.c + xxhash/xxhash.c ) +if (ENABLE_JIT) + enable_language(ASM) + + target_sources(core PRIVATE + ARM_InstrInfo.cpp + + ARMJIT.cpp + ARMJIT_Memory.cpp + + dolphin/CommonFuncs.cpp + ) + + if (ARCHITECTURE STREQUAL x86_64) + target_sources(core PRIVATE + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s + ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") + endif() + if (ARCHITECTURE STREQUAL ARM64) + target_sources(core PRIVATE + dolphin/Arm64Emitter.cpp + dolphin/MathUtil.cpp + + ARMJIT_A64/ARMJIT_Compiler.cpp + ARMJIT_A64/ARMJIT_ALU.cpp + ARMJIT_A64/ARMJIT_LoadStore.cpp + ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s + ) + endif() +endif() + if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) else() diff --git a/src/CP15.cpp b/src/CP15.cpp index d340b9e..992c83f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -21,6 +21,8 @@ #include "NDS.h" #include "DSi.h" #include "ARM.h" +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -41,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -74,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -97,18 +99,26 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) + { + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; + } } void ARMv5::UpdateITCMSetting() @@ -562,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. 
%08X\n", val); + //Halt(255); return; @@ -595,6 +608,27 @@ void ARMv5::CP15Write(u32 id, u32 val) ITCMSetting = val; UpdateITCMSetting(); return; + + case 0xF00: + //printf("cache debug index register %08X\n", val); + return; + + case 0xF10: + //printf("cache debug instruction tag %08X\n", val); + return; + + case 0xF20: + //printf("cache debug data tag %08X\n", val); + return; + + case 0xF30: + //printf("cache debug instruction cache %08X\n", val); + return; + + case 0xF40: + //printf("cache debug data cache %08X\n", val); + return; + } if ((id&0xF00)!=0x700) @@ -704,7 +738,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -726,16 +760,18 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { + DataRegion = addr; + if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -745,18 +781,20 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { + DataRegion = addr; + addr &= ~1; if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -766,18 +804,20 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { + DataRegion = addr; + addr &= ~3; if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -792,13 +832,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -808,16 +848,21 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { + DataRegion = addr; + if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -827,18 +872,23 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { + DataRegion = addr; + addr &= ~1; if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -848,18 +898,23 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { + DataRegion = addr; + addr &= ~3; if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -874,13 +929,16 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 5745f34..de1c70d 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,14 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +#ifdef JIT_ENABLED +int JIT_Enable = false; +int JIT_MaxBlockSize = 32; +int JIT_BranchOptimisations = 2; +int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; +#endif + ConfigEntry ConfigFile[] = { {"BIOS9Path", 1, BIOS9Path, 0, "", 1023}, @@ -48,6 +56,14 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, +#ifdef JIT_ENABLED + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 2, NULL, 0}, + {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, +#endif + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 3947598..5916b4a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,6 +51,14 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +#ifdef JIT_ENABLED +extern int JIT_Enable; +extern int JIT_MaxBlockSize; +extern int JIT_BranchOptimisations; +extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; +#endif + } #endif // CONFIG_H diff --git a/src/DSi.cpp b/src/DSi.cpp index 216f724..97a63cd 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -26,6 +26,11 @@ #include "NDSCart.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi_NDMA.h" #include "DSi_I2C.h" #include "DSi_SD.h" @@ -34,15 +39,6 @@ #include "tiny-AES-c/aes.hpp" -namespace NDS -{ - -extern ARMv5* ARM9; -extern ARMv4* ARM7; - -} - - namespace DSi { @@ -59,9 +55,9 @@ u8 ARM7iBIOS[0x10000]; u32 MBK[2][9]; -u8 NWRAM_A[0x40000]; -u8 NWRAM_B[0x40000]; -u8 NWRAM_C[0x40000]; +u8* NWRAM_A; +u8* NWRAM_B; +u8* NWRAM_C; u8* NWRAMMap_A[2][4]; u8* NWRAMMap_B[3][8]; @@ -86,6 +82,12 @@ u8 ARM7Init[0x3C00]; bool Init() { +#ifndef JIT_ENABLED + NWRAM_A = new u8[NWRAMSize]; + NWRAM_B = new u8[NWRAMSize]; + NWRAM_C = new 
u8[NWRAMSize]; +#endif + if (!DSi_I2C::Init()) return false; if (!DSi_AES::Init()) return false; @@ -106,6 +108,12 @@ bool Init() void DeInit() { +#ifndef JIT_ENABLED + delete[] NWRAM_A; + delete[] NWRAM_B; + delete[] NWRAM_C; +#endif + DSi_I2C::DeInit(); DSi_AES::DeInit(); @@ -176,7 +184,12 @@ void SoftReset() NDS::ARM9->Reset(); NDS::ARM7->Reset(); + NDS::ARM9->CP15Reset(); + memcpy(NDS::ARM9->ITCM, ITCMInit, 0x8000); +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidateITCM(); +#endif DSi_AES::Reset(); @@ -274,9 +287,9 @@ bool LoadNAND() { printf("Loading DSi NAND\n"); - memset(NWRAM_A, 0, 0x40000); - memset(NWRAM_B, 0, 0x40000); - memset(NWRAM_C, 0, 0x40000); + memset(NWRAM_A, 0, NWRAMSize); + memset(NWRAM_B, 0, NWRAMSize); + memset(NWRAM_C, 0, NWRAMSize); memset(MBK, 0, sizeof(MBK)); memset(NWRAMMap_A, 0, sizeof(NWRAMMap_A)); @@ -527,6 +540,8 @@ void MapNWRAM_A(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(0); + int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -558,6 +573,8 @@ void MapNWRAM_B(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(1); + int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -593,6 +610,8 @@ void MapNWRAM_C(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(2); + int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -625,6 +644,8 @@ void MapNWRAMRange(u32 cpu, u32 num, u32 val) u32 oldval = MBK[cpu][5+num]; if (oldval == val) return; + ARMJIT_Memory::RemapNWRAM(num); + MBK[cpu][5+num] = val; // TODO: what happens when the ranges are 'out of range'???? @@ -826,19 +847,31 @@ void ARM9Write8(u32 addr, u8 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write8(addr, val); @@ -859,19 +892,31 @@ void ARM9Write16(u32 addr, u16 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write16(addr, val); @@ -892,19 +937,31 @@ void ARM9Write32(u32 addr, u32 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write32(addr, val); @@ -1085,19 +1142,37 @@ void ARM7Write8(u32 addr, u8 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); +#endif + } return; } return NDS::ARM7Write8(addr, val); @@ -1118,19 +1193,31 @@ void ARM7Write16(u32 addr, u16 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write16(addr, val); @@ -1151,19 +1238,31 @@ void ARM7Write32(u32 addr, u32 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, 
ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write32(addr, val); @@ -1521,7 +1620,7 @@ u8 ARM7IORead8(u32 addr) case 0x04004501: return DSi_I2C::Cnt; case 0x04004D00: if (SCFG_BIOS & (1<<10)) return 0; return ConsoleID & 0xFF; - case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; + case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; case 0x04004D02: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 16) & 0xFF; case 0x04004D03: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 24) & 0xFF; case 0x04004D04: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 32) & 0xFF; @@ -25,6 +25,8 @@ namespace DSi { +extern u16 SCFG_BIOS; + extern u8 ARM9iBIOS[0x10000]; extern u8 ARM7iBIOS[0x10000]; @@ -34,6 +36,19 @@ extern u64 ConsoleID; extern DSi_SDHost* SDMMC; extern DSi_SDHost* SDIO; +const u32 NWRAMSize = 0x40000; + +extern u8* NWRAM_A; +extern u8* NWRAM_B; +extern u8* NWRAM_C; + +extern u8* NWRAMMap_A[2][4]; +extern u8* NWRAMMap_B[3][8]; +extern u8* NWRAMMap_C[3][8]; + +extern u32 NWRAMStart[2][3]; +extern u32 NWRAMEnd[2][3]; +extern u32 NWRAMMask[2][3]; bool Init(); void DeInit(); diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp index 9984f5e..e22c708 100644 --- a/src/DSi_I2C.cpp +++ b/src/DSi_I2C.cpp @@ -21,6 +21,7 @@ #include "DSi.h" #include "DSi_I2C.h" #include "DSi_Camera.h" +#include "ARM.h" namespace DSi_BPTWL { @@ -108,7 +109,8 @@ void Write(u8 val, bool last) printf("BPTWL: soft-reset\n"); val = 0; // checkme // TODO: soft-reset might need to be scheduled later! 
- DSi::SoftReset(); + // TODO: this has been moved for the JIT to work, nothing is confirmed here + NDS::ARM7->Halt(4); CurPos = -1; return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 22368ae..6981a42 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,11 @@ #include "AREngine.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -93,17 +98,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -168,6 +173,14 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED + ARMJIT::Init(); +#else + MainRAM = new u8[0x1000000]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; +#endif + DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); DMAs[2] = new DMA(0, 2); @@ -200,6 +213,10 @@ void DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED + ARMJIT::DeInit(); +#endif + for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -249,11 +266,9 @@ void SetARM9RegionTimings(u32 addrstart, u32 addrend, int buswidth, int nonseq, ARM9MemTimings[i][3] = S32; } - addrstart <<= 14; - addrend <<= 14; - if (!addrend) addrend = 0xFFFFFFFF; - - ARM9->UpdateRegionTimings(addrstart, addrend); + ARM9->UpdateRegionTimings(addrstart<<14, addrend == 0x40000 + ? 0xFFFFFFFF + : (addrend<<14)); } void SetARM7RegionTimings(u32 addrstart, u32 addrend, int buswidth, int nonseq, int seq) @@ -478,6 +493,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -492,6 +511,10 @@ void Reset() ARM9ClockShift = 1; MainRAMMask = 0x3FFFFF; } + // has to be called before InitTimings + // otherwise some PU settings are completely + // unitialised on the first run + ARM9->CP15Reset(); ARM9Timestamp = 0; ARM9Target = 0; ARM7Timestamp = 0; ARM7Target = 0; @@ -499,7 +522,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -690,7 +713,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -787,6 +810,13 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED + if (!file->Saving) + { + ARMJIT::ResetBlockCache(); + } +#endif + return true; } @@ -877,6 +907,7 @@ void RunSystem(u64 timestamp) } } +template <bool EnableJIT> u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -910,7 +941,12 @@ u32 RunFrame() } else { - ARM9->Execute(); +#ifdef JIT_ENABLED + if (EnableJIT) + ARM9->ExecuteJIT(); + else +#endif + ARM9->Execute(); } RunTimers(0); @@ -933,7 +969,12 @@ u32 RunFrame() } else { - ARM7->Execute(); +#ifdef JIT_ENABLED + if (EnableJIT) + ARM7->ExecuteJIT(); + else +#endif + ARM7->Execute(); } RunTimers(1); @@ -963,6 +1004,16 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ +#ifdef JIT_ENABLED + if (Config::JIT_Enable) + return RunFrame<true>(); + else +#endif + return RunFrame<false>(); +} + void 
Reschedule(u64 target) { if (CurCPU == 0) @@ -1082,36 +1133,41 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch (WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } } @@ -1166,9 +1222,9 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); if ((ConsoleType == 1) && cpu) - arm->IRQ |= (IE2 & IF2); + arm->IRQ |= !!(IE2 & IF2); } else { @@ -1787,9 +1843,9 @@ u8 ARM9Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1852,9 +1908,9 @@ u16 ARM9Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1917,9 +1973,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1974,13 +2030,19 @@ void ARM9Write8(u32 addr, u8 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u8*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2024,13 +2086,19 @@ void ARM9Write16(u32 addr, u16 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u16*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2044,13 +2112,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: 
GPU::WriteVRAM_ABG<u16>(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG<u16>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u16>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u16>(addr, val); return; - default: GPU::WriteVRAM_LCDC<u16>(addr, val); return; + default: GPU::WriteVRAM_LCDC<u16>(addr, val); return; } case 0x07000000: @@ -2090,13 +2161,19 @@ void ARM9Write32(u32 addr, u32 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u32*)&MainRAM[addr & MainRAMMask] = val; return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2110,13 +2187,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG<u32>(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG<u32>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u32>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u32>(addr, val); return; - default: GPU::WriteVRAM_LCDC<u32>(addr, val); return; + default: GPU::WriteVRAM_LCDC<u32>(addr, val); return; } case 0x07000000: @@ -2149,7 +2229,7 @@ void ARM9Write32(u32 addr, u32 val) return; } - printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); + //printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); } bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) @@ -2162,10 +2242,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2204,17 +2284,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2264,17 +2344,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2331,17 +2411,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + 
return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2385,23 +2465,35 @@ void ARM7Write8(u32 addr, u8 val) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u8*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2410,6 +2502,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); +#endif GPU::WriteVRAM_ARM7<u8>(addr, val); return; @@ -2444,23 +2539,35 @@ void ARM7Write16(u32 addr, u16 val) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u16*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2477,6 +2584,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); +#endif GPU::WriteVRAM_ARM7<u16>(addr, val); return; @@ -2513,23 +2623,35 @@ void ARM7Write32(u32 addr, u32 val) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u32*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2547,6 +2669,9 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 
0x06800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); +#endif GPU::WriteVRAM_ARM7<u32>(addr, val); return; @@ -2594,17 +2719,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } @@ -80,7 +80,7 @@ enum IRQ_IPCSendDone, IRQ_IPCRecv, IRQ_CartSendDone, // TODO: less misleading name - IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pokémon Typing Adventure, BT controller) + IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pok�mon Typing Adventure, BT controller) IRQ_GXFIFO, IRQ_LidOpen, IRQ_SPI, @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,11 +162,22 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; +const u32 MainRAMMaxSize = 0x1000000; + +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; + +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; + extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h new file mode 100644 index 0000000..40c4576 --- /dev/null +++ b/src/dolphin/Align.h @@ -0,0 +1,24 @@ +// This file is under the public domain. + +#pragma once + +#include <cstddef> +#include <type_traits> + +namespace Common +{ +template <typename T> +constexpr T AlignUp(T value, size_t size) +{ + static_assert(std::is_unsigned<T>(), "T must be an unsigned value."); + return static_cast<T>(value + (size - value % size) % size); +} + +template <typename T> +constexpr T AlignDown(T value, size_t size) +{ + static_assert(std::is_unsigned<T>(), "T must be an unsigned value."); + return static_cast<T>(value - value % size); +} + +} // namespace Common diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp new file mode 100644 index 0000000..dd2416b --- /dev/null +++ b/src/dolphin/Arm64Emitter.cpp @@ -0,0 +1,4466 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cinttypes> +#include <cstring> +#include <vector> + +#include "Compat.h" +#include "Align.h" +#include "Arm64Emitter.h" +#include "BitUtils.h" +#include "../types.h" +#include "MathUtil.h" + +namespace Arm64Gen +{ +namespace +{ +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +int CountLeadingZeros(uint64_t value, int width) +{ + // TODO(jbramley): Optimize this for ARM64 hosts. 
+ int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) + { + count++; + bit_test >>= 1; + } + return count; +} + +uint64_t LargestPowerOf2Divisor(uint64_t value) +{ + return value & -(int64_t)value; +} + +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32* val, bool* shift) +{ + if (input < 4096) + { + *val = input; + *shift = false; + return true; + } + else if ((input & 0xFFF000) == input) + { + *val = input >> 12; + *shift = true; + return true; + } + return false; +} + +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s, + unsigned int* imm_r) +{ + // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL)); + // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits)); + + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) + { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) + { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. 
Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) + { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } + else + { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) + { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } + else + { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!MathUtil::IsPow2<u64>(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array<uint64_t, 6> multipliers = {{ + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }}; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. 
+ DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast<size_t>(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) + { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } + else + { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms. + *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + +float FPImm8ToFloat(u8 bits) +{ + const u32 sign = bits >> 7; + const u32 bit6 = (bits >> 6) & 1; + const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + const u32 mantissa = (bits & 0xF) << 19; + const u32 f = (sign << 31) | (exp << 23) | mantissa; + + return Common::BitCast<float>(f); +} + +bool FPImm8FromFloat(float value, u8* imm_out) +{ + const u32 f = Common::BitCast<u32>(value); + const u32 mantissa4 = (f & 0x7FFFFF) >> 19; + const u32 exponent = (f >> 23) & 0xFF; + const u32 sign = f >> 31; + + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + + const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4; + const float new_float = FPImm8ToFloat(imm8); + if (new_float == value) + *imm_out = imm8; + else + return false; + + return true; +} +} // Anonymous namespace + +void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr) +{ + m_code = ptr; +} + +void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr) +{ + SetCodePtrUnsafe(ptr); + m_lastCacheFlushEnd = ptr; +} + +void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase) +{ + m_code = 0; + m_lastCacheFlushEnd = 0; + m_rwbase = rwbase; + m_rxbase = rxbase; +} + +ptrdiff_t ARM64XEmitter::GetCodeOffset() +{ + return m_code; +} + +const u8* ARM64XEmitter::GetRWPtr() +{ + return m_rwbase + m_code; +} + +u8* ARM64XEmitter::GetWriteableRWPtr() +{ + return m_rwbase + m_code; +} + +void* ARM64XEmitter::GetRXPtr() +{ + return m_rxbase + m_code; +} + +void ARM64XEmitter::ReserveCodeSpace(u32 bytes) +{ + for (u32 i = 0; i < bytes / 4; i++) + BRK(0); +} + +ptrdiff_t ARM64XEmitter::AlignCode16() +{ + int c = int((u64)m_code & 15); + if (c) + ReserveCodeSpace(16 - c); + return m_code; +} + 
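(Editor's illustration, not part of the patch.) The IsImmLogical helper above implements the ARMv8 rule that a logical immediate must be a single contiguous run of set bits, rotated and then repeated across the register at a power-of-two period; the N/imms/immr fields it returns are what the emitter later packs into the immediate forms of AND/ORR/EOR/ANDS, and constants that fail the check have to be materialised some other way (for example MOVZ/MOVK or a register operand). A minimal probe sketch follows, assuming the IsImmLogical signature shown in the hunk above; since the real definition sits in an anonymous namespace in Arm64Emitter.cpp, this would in practice be compiled into the same translation unit or use a copy of the function.

// Standalone sketch (assumption: IsImmLogical is visible to this code).
// Prints whether a 64-bit constant qualifies as an ARM64 logical immediate.
#include <cstdint>
#include <cstdio>

bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n,
                  unsigned int* imm_s, unsigned int* imm_r);  // as in the hunk above

static void ProbeLogicalImm(uint64_t value)
{
    unsigned int n, imm_s, imm_r;
    if (IsImmLogical(value, 64, &n, &imm_s, &imm_r))
        std::printf("%016llx encodable: N=%u imms=0x%02X immr=0x%02X\n",
                    (unsigned long long)value, n, imm_s, imm_r);
    else
        std::printf("%016llx not encodable (materialise it instead)\n",
                    (unsigned long long)value);
}

int main()
{
    ProbeLogicalImm(0x5555555555555555ULL); // alternating bits: a 1-bit run repeated every 2 bits
    ProbeLogicalImm(0x00FF00FF00FF00FFULL); // an 8-bit run repeated every 16 bits
    ProbeLogicalImm(0x1234567812345678ULL); // no single rotated run of ones, so rejected
    return 0;
}

The middle case illustrates why the function repeats a 32-bit input into a 64-bit word first: the same run-and-period analysis then covers both register widths with one code path.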
+ptrdiff_t ARM64XEmitter::AlignCodePage() +{ + int c = int((u64)m_code & 4095); + if (c) + ReserveCodeSpace(4096 - c); + return m_code; +} + +void ARM64XEmitter::Write32(u32 value) +{ + std::memcpy(m_rwbase + m_code, &value, sizeof(u32)); + m_code += sizeof(u32); +} + +void ARM64XEmitter::FlushIcache() +{ + FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code); + m_lastCacheFlushEnd = m_code; +} + +void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end) +{ + if (start == end) + return; + +#if defined(IOS) + // Header file says this is equivalent to: sys_icache_invalidate(start, end - start); + sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start); +#else + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + u64 addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + addr = (u64)start & ~(u64)(dsize - 1); + for (; addr < (u64)end; addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069. + __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = (u64)start & ~(u64)(isize - 1); + for (; addr < (u64)end; addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 
0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt); +} + +void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + (((u32)distance << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) +{ + s64 distance = (s64)ptr - s64(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) +{ + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", + __func__, imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) +{ + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | 
(ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? (1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, 
ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) +{ + Rs = DecodeReg(Rs); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) | + (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | + (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, 
u32 immr, u32 imms, + int n) +{ + // Sometimes Rd is fixed to SP, but can still be 32bit or 64bit. + // Use Rn to determine bitness here. + bool b64Bit = Is64Bit(Rn); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + u32 type_encode = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (b64Bit) + { + op |= 0b10; + imm >>= 3; + } + else + { + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} +void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) +{ + Rd = DecodeReg(Rd); + + Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +static constexpr bool IsInRangeImm19(s64 distance) +{ + return (distance >= -0x40000 && distance <= 0x3FFFF); +} + +static constexpr bool IsInRangeImm14(s64 distance) +{ + return (distance >= -0x2000 && distance <= 0x1FFF); +} + +static constexpr bool IsInRangeImm26(s64 distance) +{ + return (distance >= -0x2000000 && distance <= 0x1FFFFFF); +} + +static constexpr u32 MaskImm19(s64 distance) +{ + return distance & 0x7FFFF; +} + +static constexpr u32 MaskImm14(s64 distance) +{ + return distance & 0x3FFF; +} + +static constexpr u32 MaskImm26(s64 distance) +{ + return distance & 0x3FFFFFF; +} + +// FixupBranch branching +void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) +{ + bool Not = false; + u32 inst = 0; + s64 distance = (s64)(m_code - branch.ptr); + distance >>= 2; + + switch (branch.type) + { + case 1: // CBNZ + Not = true; + case 0: // CBZ + { + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + bool b64Bit = Is64Bit(branch.reg); + ARM64Reg reg = DecodeReg(branch.reg); + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg; + } + break; + case 2: // B (conditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond; + break; + case 4: // TBNZ + Not = true; + case 3: // TBZ + { + ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + ARM64Reg reg = DecodeReg(branch.reg); + inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | + (MaskImm14(distance) << 5) | reg; + } + break; + case 5: // B (uncoditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" 
PRIx64, + __func__, branch.type, distance); + inst = (0x5 << 26) | MaskImm26(distance); + break; + case 6: // BL (unconditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + + std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) +{ + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr, + distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) +{ + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) +{ + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BLR(scratchreg); + } + else + { + BL(func); + } +} + +void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! 
Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BR(scratchreg); + } + else + { + B(func); + } +} + +// Unconditional Branch (register) +void ARM64XEmitter::BR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::BLR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::RET(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::ERET() +{ + EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP); +} +void ARM64XEmitter::DRPS() +{ + EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP); +} + +// Exception generation +void ARM64XEmitter::SVC(u32 imm) +{ + EncodeExceptionInst(0, imm); +} + +void ARM64XEmitter::HVC(u32 imm) +{ + EncodeExceptionInst(1, imm); +} + +void ARM64XEmitter::SMC(u32 imm) +{ + EncodeExceptionInst(2, imm); +} + +void ARM64XEmitter::BRK(u32 imm) +{ + EncodeExceptionInst(3, imm); +} + +void ARM64XEmitter::HLT(u32 imm) +{ + EncodeExceptionInst(4, imm); +} + +void ARM64XEmitter::DCPS1(u32 imm) +{ + EncodeExceptionInst(5, imm); +} + +void ARM64XEmitter::DCPS2(u32 imm) +{ + EncodeExceptionInst(6, imm); +} + +void ARM64XEmitter::DCPS3(u32 imm) +{ + EncodeExceptionInst(7, imm); +} + +// System +void ARM64XEmitter::_MSR(PStateField field, u8 imm) +{ + u32 op1 = 0, op2 = 0; + switch (field) + { + case FIELD_SPSel: + op1 = 0; + op2 = 5; + break; + case FIELD_DAIFSet: + op1 = 3; + op2 = 6; + break; + case FIELD_DAIFClr: + op1 = 3; + op2 = 7; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2) +{ + switch (field) + { + case FIELD_NZCV: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 2; + op2 = 0; + break; + case FIELD_FPCR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 0; + break; + case FIELD_FPSR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 1; + break; + case FIELD_PMCR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 6; + op2 = 0; + break; + case FIELD_PMCCNTR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 7; + op2 = 0; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to"); + break; + } +} + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS <Xt>, CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + +void ARM64XEmitter::HINT(SystemHint op) +{ + EncodeSystemInst(0, 3, 2, 0, op, WSP); +} +void ARM64XEmitter::CLREX() +{ + EncodeSystemInst(0, 3, 3, 0, 2, WSP); +} +void ARM64XEmitter::DSB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 4, WSP); +} +void ARM64XEmitter::DMB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 5, WSP); +} +void ARM64XEmitter::ISB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 6, WSP); 
+} + +// Add/Subtract (extended register) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm) +{ + CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm) +{ + CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +// Add/Subtract (with carry) +void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm); +} +void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm); +} +void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm); +} +void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm); +} + +// Conditional Compare (immediate) +void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond); +} + +// Conditiona Compare (register) +void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond); +} + +// Conditional Select +void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(0, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(1, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(2, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(3, Rd, Rn, Rm, cond); +} + +// Data-Processing 1 source +void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(0, Rd, Rn); +} +void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(1, Rd, Rn); +} +void 
ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(2, Rd, Rn); +} +void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void 
ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) +{ + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) +{ + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? 
SP : WSP); +} + +// Data Processing (Immediate) +void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(2, Rd, imm, pos); +} +void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(0, Rd, imm, pos); +} +void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(3, Rd, imm, pos); +} + +// Bitfield move +void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms); +} +void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms); +} +void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms); +} + +void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) +{ + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG(DYNA_REC, (lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) +{ + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG(DYNA_REC, (lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) +{ + bool sf = Is64Bit(Rd); + bool N = sf; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd); +} +void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + SBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + SBFM(Rd, Rn, 0, 15); +} +void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__); + SBFM(Rd, Rn, 0, 31); +} +void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 15); +} + +// Load Register (Literal) +void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(0, Rt, imm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(2, Rt, imm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(3, Rt, imm); +} + +// Load/Store pair +void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm); +} + +// Load/Store Exclusive +void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(4, SP, SP, Rt, 
Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 
1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 
2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must have a 64bit destination register!", __func__); + EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm); +} + +void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + switch (size | signExtend) + { + case 32: LDR (Rt, Rn, Rm); break; + case 33: LDRSW(Rt, Rn, Rm); break; + case 16: LDRH (Rt, Rn, Rm); break; + case 17: LDRSH(Rt, Rn, Rm); break; + case 8: LDRB (Rt, Rn, Rm); break; + case 9: LDRSB(Rt, Rn, Rm); break; + default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break; + } +} +void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + switch (size) + { + case 32: STR (Rt, Rn, Rm); break; + case 16: STRH (Rt, Rn, Rm); break; + case 8: STRB (Rt, Rn, Rm); break; + default: PanicAlert("STRGeneric(reg): invalid size %d", size); break; + } +} + +void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + switch (size | signExtend) + { + case 32: LDR (type, Rt, Rn, imm); break; + case 33: LDRSW(type, Rt, Rn, imm); break; + case 16: LDRH (type, Rt, Rn, imm); break; + case 17: LDRSH(type, Rt, Rn, imm); break; + case 8: LDRB (type, Rt, Rn, imm); break; + case 9: LDRSB(type, Rt, Rn, imm); break; + default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break; + } +} +void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + switch (size) + { + case 32: STR (type, Rt, Rn, imm); break; + case 16: STRH (type, Rt, Rn, imm); break; + case 8: STRB (type, Rt, Rn, imm); break; + default: PanicAlert("STRGeneric(imm): invalid size %d", size); break; + } +} + +// Address of label/page PC-relative +void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm) +{ + EncodeAddressInst(0, Rd, imm); +} +void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm) +{ + EncodeAddressInst(1, Rd, imm >> 12); +} + +// Wrapper around MOVZ+MOVK (and later MOVN) +void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) +{ + unsigned int parts = Is64Bit(Rd) ? 4 : 2; + BitSet32 upload_part(0); + + // Always start with a movz! Kills the dependency on the register. + bool use_movz = true; + + if (!imm) + { + // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks + // clearer in disasm too. + MOVZ(Rd, 0, SHIFT_0); + return; + } + + if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) || + (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max())) + { + // Max unsigned value (or if signed, -1) + // Set to ~ZR + ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP; + ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0)); + return; + } + + // TODO: Make some more systemic use of MOVN, but this will take care of most cases. + // Small negative integer. 
Use MOVN + if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm) + { + MOVN(Rd, ~imm, SHIFT_0); + return; + } + + // XXX: Use MOVN when possible. + // XXX: Optimize more + // XXX: Support rotating immediates to save instructions + if (optimize) + { + for (unsigned int i = 0; i < parts; ++i) + { + if ((imm >> (i * 16)) & 0xFFFF) + upload_part[i] = 1; + } + } + + u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF; +s64 aligned_offset = (s64)imm - (s64)aligned_pc; + // The offset for ADR/ADRP is an s32, so make sure it can be represented in that + if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL) + { + // Immediate we are loading is within 4GB of our aligned range + // Most likely a address that we can load in one or two instructions + if (!(std::abs(aligned_offset) & 0xFFF)) + { + // Aligned ADR + ADRP(Rd, (s32)aligned_offset); + return; + } + else + { + // If the address is within 1MB of PC we can load it in a single instruction still + s64 offset = (s64)imm - (s64)(m_rxbase + m_code); + if (offset >= -0xFFFFF && offset <= 0xFFFFF) + { + ADR(Rd, (s32)offset); + return; + } + else + { + ADRP(Rd, (s32)(aligned_offset & ~0xFFF)); + ADD(Rd, Rd, imm & 0xFFF); + return; + } + } + } + + for (unsigned i = 0; i < parts; ++i) + { + if (use_movz && upload_part[i]) + { + MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); + use_movz = false; + } + else + { + if (upload_part[i] || !optimize) + MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); + } + } +} + +bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2) +{ + // TODO: Also optimize for performance, not just for code size. + ptrdiff_t start_offset = GetCodeOffset(); + + MOVI2R(Rd, imm1); + int size1 = GetCodeOffset() - start_offset; + + SetCodePtrUnsafe(start_offset); + + MOVI2R(Rd, imm2); + int size2 = GetCodeOffset() - start_offset; + + SetCodePtrUnsafe(start_offset); + + bool element = size1 > size2; + + MOVI2R(Rd, element ? imm2 : imm1); + + return element; +} + +void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers) +{ + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); + + if (!num_regs) + return; + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last write to avoid the dependency between those stores. + + // The first push must adjust the SP, else a context switch may invalidate everything below SP. + if (num_regs & 1) + { + STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size); + } + else + { + ARM64Reg first_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg second_reg = (ARM64Reg)(X0 + *it++); + STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size); + } + + // Fast store for all other registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + { + ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg even_reg = (ARM64Reg)(X0 + *it++); + STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1)); + } + + ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__); +} + +void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask) +{ + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); + + if (!num_regs) + return; + + // We must adjust the SP in the end, so load the first (two) registers at least. 
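As a rough sketch of what the push/pop helpers above work out to for a hypothetical register set {X0, X1, X2} (so num_regs = 3 and stack_size = (3 + 1) * 8 = 32; these are illustrative emitter calls, assuming the X0–X2 enum values, not lines from the patch):

  STR(INDEX_PRE, X0, SP, -32);        // odd count: the first store pre-decrements SP by the whole frame
  STP(INDEX_SIGNED, X1, X2, SP, 16);  // the remaining even pair is stored at a fixed offset
  // ABI_PopRegisters mirrors this: LDP(INDEX_SIGNED, X1, X2, SP, 16), then LDR(INDEX_POST, X0, SP, 32),
  // so SP is written exactly once on the way in and once on the way out.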
+ ARM64Reg first = (ARM64Reg)(X0 + *it++); + ARM64Reg second; + if (!(num_regs & 1)) + second = (ARM64Reg)(X0 + *it++); + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last load to avoid the dependency between those loads. + + // Fast load for all but the first (two) registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + { + ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg even_reg = (ARM64Reg)(X0 + *it++); + LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1)); + } + + // Post loading the first (two) registers. + if (num_regs & 1) + LDR(INDEX_POST, first, SP, stack_size); + else + LDP(INDEX_POST, first, second, SP, stack_size); + + ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__); +} + +// Float Emitter +void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, + ARM64Reg Rn, s32 imm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + u32 encoded_size = 0; + u32 encoded_imm = 0; + + if (size == 8) + encoded_size = 0; + else if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + else if (size == 128) + encoded_size = 0; + + if (type == INDEX_UNSIGNED) + { + ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)), + "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__, + imm, m_emit->GetCodePtr()); + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", + __func__); + if (size == 16) + imm >>= 1; + else if (size == 32) + imm >>= 2; + else if (size == 64) + imm >>= 3; + else if (size == 128) + imm >>= 4; + encoded_imm = (imm & 0xFFF); + } + else + { + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), + "%s immediate offset must be within range of -256 to 256!", __func__); + encoded_imm = (imm & 0x1FF) << 2; + if (type == INDEX_POST) + encoded_imm |= 1; + else + encoded_imm |= 3; + } + + Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | + (size == 128 ? 
(1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) | + (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) +{ + DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) + { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 
1 : 0); + int rmode = 0; + switch (round) + { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } + else + { + // Use the encoding (vector, single) that keeps the result in the fp register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) + { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) | + (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = !IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! 
Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) | + Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + u32 type_encode = 0; + u32 opc = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) + { + ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } + else if (size == 64) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + imm >>= 3; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 
26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) +{ + union + { + u8 hex; + struct + { + unsigned defgh : 5; + unsigned abc : 3; + }; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 1; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 1; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 1; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 1; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = 
EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + 
opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) +{ + if (IsScalar(Rd) && IsScalar(Rn)) + { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } + else + { + ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) + { + // GPR to scalar single + opcode |= 1; + } + else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) + { + // Scalar single to GPR - defaults are correct + } + else + { + // TODO + ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset 
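A minimal usage sketch for the paired SIMD/FP load/stores defined just above (assuming an ARM64FloatEmitter instance `fp`, the D0–D3 and X0 register enum values, and a valid base pointer in X0; hypothetical, for illustration only):

  fp.STP(64, INDEX_SIGNED, D0, D1, X0, 0);   // store two doubles at [x0] and [x0+8]
  fp.LDP(64, INDEX_SIGNED, D2, D3, X0, 0);   // load them back into d2/d3

The size argument (32/64/128) selects the opc field in EncodeLoadStorePair, which also checks that the immediate offset is aligned to the element size.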
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) +{ + int type = isDouble ? 
1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) | + (Rn << 5) | Rd); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) +{ + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void 
ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) +{ + u32 imm5 = 0, imm4 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64), + "%s must have a size of 64 when destination is 64bit!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 
0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, 
ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + 
ASSERT_MSG(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) + { + immh = 1; + } + else if (dest_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (dest_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__); + } + else if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else // 64 + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) + { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones 
variant + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else + { + ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__); + } + EncodeModImm(Q, op, cmode, 0, Rd, imm); +} + +void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + int num_regs = registers.Count(); + m_emit->SUB(SP, SP, num_regs * 16); + m_emit->ADD(tmp, SP, 0); + std::vector<ARM64Reg> island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + + // 0 = true + // 1 < 4 && registers[i + 1] true! + // 2 < 4 && registers[i + 2] true! + // 3 < 4 && registers[i + 3] true! + // 4 < 4 && registers[i + 4] false! + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp); + + i += count - 1; + } + + // Handle island registers + std::vector<ARM64Reg> pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_POST, pair_regs[0], tmp, 16); + } + else + { + std::vector<ARM64Reg> pair_regs; + for (auto it : registers) + { + pair_regs.push_back((ARM64Reg)(Q0 + it)); + if (pair_regs.size() == 2) + { + STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_PRE, pair_regs[0], SP, -16); + } +} +void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + int num_regs = registers.Count(); + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + // The temporary register is only used to indicate that we can use this code path + std::vector<ARM64Reg> island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP); + + i += count - 1; + } + + // Handle island registers + std::vector<ARM64Reg> pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + LDR(128, INDEX_POST, pair_regs[0], SP, 16); + } + else + { + bool odd = num_regs % 2; + std::vector<ARM64Reg> pair_regs; + for (int i = 31; i >= 0; --i) + { + if (!registers[i]) + continue; + + if (odd) + { + // First load must be a regular LDR if odd + odd = false; + LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16); + } + else + { + pair_regs.push_back((ARM64Reg)(Q0 + i)); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[1], 
pair_regs[0], SP, 32); + pair_regs.clear(); + } + } + } + } +} + +void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) +{ + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, imm, shift); + break; + case 1: + ADDS(Rd, Rn, imm, shift); + break; + case 2: + SUB(Rd, Rn, imm, shift); + break; + case 3: + SUBS(Rd, Rn, imm, shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) +{ + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) + { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) + { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but inplace. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done on setting flags. 
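  // Editor's note (illustrative, not part of this commit): with imm = 0x123456 and
  // flags not requested, the branch below emits
  //   ADD Rd, Rn, #0x456
  //   ADD Rd, Rd, #0x123, LSL #12
  // rather than materialising the constant into a scratch register with MOVZ/MOVK.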
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) + { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) + { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(DYNA_REC, has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch", + (u32)imm); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) +{ + ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) + { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm instruction, like + // 1.0... 
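    // Editor's note (not part of this commit): the FPImm8FromFloat branch below
    // covers the AArch64 FMOV (immediate) range, roughly +/-(1 + m/16) * 2^e with
    // m in 0..15 and e in -3..4, so constants such as 0.5, 1.0 or 2.0 need no scratch.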
+ } + else if (FPImm8FromFloat(value, &imm8)) + { + FMOV(Rd, imm8); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Common::BitCast<u32>(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) +{ + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Arm64Gen diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h new file mode 100644 index 0000000..3d9d4ba --- /dev/null +++ b/src/dolphin/Arm64Emitter.h @@ -0,0 +1,1151 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <cstring> +#include <functional> + +#include "ArmCommon.h" +#include "BitSet.h" +#include "Compat.h" + +namespace Arm64Gen +{ +// X30 serves a dual purpose as a link register +// Encoded as <u3:type><u5:reg> +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg +{ + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) +{ + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) +{ + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool 
IsQuad(ARM64Reg reg) +{ + return (reg & 0xC0) == 0xC0; +} +constexpr bool IsVector(ARM64Reg reg) +{ + return (reg & 0xC0) != 0; +} +constexpr bool IsGPR(ARM64Reg reg) +{ + return static_cast<int>(reg) < 0x40; +} + +constexpr ARM64Reg DecodeReg(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg & 0x1F); +} +constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg | 0x20); +} +constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(DecodeReg(reg) + S0); +} +constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg) +{ + return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80); +} +constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg | 0xC0); +} + +enum OpType +{ + TYPE_IMM = 0, + TYPE_REG, + TYPE_IMMSREG, + TYPE_RSR, + TYPE_MEM +}; + +enum ShiftType +{ + ST_LSL = 0, + ST_LSR = 1, + ST_ASR = 2, + ST_ROR = 3, +}; + +enum IndexType +{ + INDEX_UNSIGNED, + INDEX_POST, + INDEX_PRE, + INDEX_SIGNED, // used in LDP/STP +}; + +enum ShiftAmount +{ + SHIFT_0 = 0, + SHIFT_16 = 1, + SHIFT_32 = 2, + SHIFT_48 = 3, +}; + +enum RoundingMode +{ + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero +}; + +struct FixupBranch +{ + ptrdiff_t ptr; + // Type defines + // 0 = CBZ (32bit) + // 1 = CBNZ (32bit) + // 2 = B (conditional) + // 3 = TBZ + // 4 = TBNZ + // 5 = B (unconditional) + // 6 = BL (unconditional) + u32 type; + + // Used with B.cond + CCFlags cond; + + // Used with TBZ/TBNZ + u8 bit; + + // Used with Test/Compare and Branch + ARM64Reg reg; +}; + +enum PStateField +{ + FIELD_SPSel = 0, + FIELD_DAIFSet, + FIELD_DAIFClr, + FIELD_NZCV, // The only system registers accessible from EL0 (user space) + FIELD_PMCR_EL0, + FIELD_PMCCNTR_EL0, + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, +}; + +enum SystemHint +{ + HINT_NOP = 0, + HINT_YIELD, + HINT_WFE, + HINT_WFI, + HINT_SEV, + HINT_SEVL, +}; + +enum BarrierType +{ + OSHLD = 1, + OSHST = 2, + OSH = 3, + NSHLD = 5, + NSHST = 6, + NSH = 7, + ISHLD = 9, + ISHST = 10, + ISH = 11, + LD = 13, + ST = 14, + SY = 15, +}; + +class ArithOption +{ +public: + enum WidthSpecifier + { + WIDTH_DEFAULT, + WIDTH_32BIT, + WIDTH_64BIT, + }; + + enum ExtendSpecifier + { + EXTEND_UXTB = 0x0, + EXTEND_UXTH = 0x1, + EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */ + EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */ + EXTEND_SXTB = 0x4, + EXTEND_SXTH = 0x5, + EXTEND_SXTW = 0x6, + EXTEND_SXTX = 0x7, + }; + + enum TypeSpecifier + { + TYPE_EXTENDEDREG, + TYPE_IMM, + TYPE_SHIFTEDREG, + }; + +private: + ARM64Reg m_destReg; + WidthSpecifier m_width; + ExtendSpecifier m_extend; + TypeSpecifier m_type; + ShiftType m_shifttype; + u32 m_shift; + +public: + ArithOption(ARM64Reg Rd, bool index = false) + { + // Indexed registers are a certain feature of AARch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + + m_destReg = Rd; + m_type = TYPE_EXTENDEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + m_extend = EXTEND_UXTX; + } + else + { + m_width = WIDTH_32BIT; + m_extend = EXTEND_UXTW; + } + m_shifttype = ST_LSL; + } + 
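  // Editor's note (illustrative, not part of this commit): with the register-offset
  // load/store overloads declared further below, this constructor is used roughly as
  //   LDR(W0, X1, ArithOption(X2, true));  // address = X1 + (X2 << 2) for a 32-bit LDR
  //   LDR(W0, X1, ArithOption(X2));        // address = X1 + X2, no scaling
  // where the scaled shift follows the access size noted above (LSL #1/#2/#3).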
ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) + { + m_destReg = Rd; + m_shift = shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } + else + { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { return m_type; } + ARM64Reg GetReg() const { return m_destReg; } + u32 GetData() const + { + switch (m_type) + { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter +{ + friend class ARM64FloatEmitter; + +private: + ptrdiff_t m_code; + ptrdiff_t m_lastCacheFlushEnd; + u8* m_rwbase; + u8* m_rxbase; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + // TODO: make this less ugly + // used for Switch where memory is executable and writeable and different addresses + // we need to take this for relative addressing in account + + void 
Write32(u32 value); + +public: + ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), m_rwbase(nullptr), m_rxbase(nullptr) {} + ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset) + { + m_rwbase = rwbase; + m_rxbase = rxbase; + m_code = offset; + m_lastCacheFlushEnd = offset; + } + + virtual ~ARM64XEmitter() {} + void SetCodePtr(ptrdiff_t ptr); + void SetCodePtrUnsafe(ptrdiff_t ptr); + void SetCodeBase(u8* rwbase, u8* rxbase); + void ReserveCodeSpace(u32 bytes); + ptrdiff_t AlignCode16(); + ptrdiff_t AlignCodePage(); + ptrdiff_t GetCodeOffset(); + const u8* GetRWPtr(); + u8* GetWriteableRWPtr(); + void* GetRXPtr(); + void FlushIcache(); + void FlushIcacheSection(u8* start, u8* end); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + 
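  // Editor's note (illustrative, not part of this commit): combined with the zero
  // register, CSINC/CSINV map onto the CSET/CSETM aliases defined just below, e.g.
  //   CSET(W0, CC_EQ);   // W0 = 1 if the flags indicate equality, else 0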
void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + void CSET(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void 
BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + // Convenience wrappers around ORR. These match the official convenience syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); } + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); } + // Load Register (Literal) + void LDR(ARM64Reg Rt, u32 imm); + void LDRSW(ARM64Reg Rt, u32 imm); + void PRFM(ARM64Reg Rt, u32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void 
LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + + // Load/Store register (immediate indexed) + void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store register (register offset) + void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Load/Store register (unscaled offset) + void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store pair + void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Address of label/page PC-relative + void ADR(ARM64Reg Rd, s32 imm); + void ADRP(ARM64Reg Rd, s32 imm); + + // Wrapper around MOVZ+MOVK + void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true); + bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2); + template <class P> + void MOVP2R(ARM64Reg Rd, P* ptr) + { + ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers"); + MOVI2R(Rd, (uintptr_t)ptr); + } + + // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch + // register. + void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) + { + ANDSI2R(Is64Bit(Rn) ? 
ZR : WZR, Rn, imm, scratch); + } + void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch); + void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryCMPI2R(ARM64Reg Rn, u32 imm); + + bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + + // ABI related + void ABI_PushRegisters(BitSet32 registers); + void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0)); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template <typename T, typename... Args> + static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args) + { + return (*f)(args...); + } + + // This function expects you to have set up the state. + // Overwrites X0 and X30 + template <typename T, typename... Args> + ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f) + { + auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>; + MOVI2R(X30, (uintptr_t)trampoline); + MOVI2R(X0, (uintptr_t) const_cast<void*>((const void*)f)); + return X30; + } + + void QuickTailCall(ARM64Reg scratchreg, const void* func); + template <typename T> + void QuickTailCall(ARM64Reg scratchreg, T func) + { + QuickTailCall(scratchreg, (const void*)func); + } + + // Plain function call + void QuickCallFunction(ARM64Reg scratchreg, const void* func); + template <typename T> + void QuickCallFunction(ARM64Reg scratchreg, T func) + { + QuickCallFunction(scratchreg, (const void*)func); + } +}; + +class ARM64FloatEmitter +{ +public: + ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {} + void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore unscaled + void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore single structure + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + + // Loadstore multiple structure + void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, u8 count, 
IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + + // Loadstore paired + void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, 
ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that goes to an integer + // register + // and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + inline void Write32(u32 value) { m_emit->Write32(value); } + // Emitting functions + void 
EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, + ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +}
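Editor's note: the header above declares the two emitter classes the JIT drives, ARM64XEmitter for integer and branch instructions and ARM64FloatEmitter for NEON/FP. A minimal, illustrative sketch of how a caller might use the immediate helpers (not code from this commit; code-buffer setup via SetCodeBase and cache flushing are elided):

    using namespace Arm64Gen;

    void EmitAddConstant(ARM64XEmitter& emit, u64 value)
    {
        // x0 = x1 + value; ADDI2R falls back to MOVI2R into the scratch
        // register (x8 here) when the immediate cannot be encoded directly.
        emit.ADDI2R(X0, X1, value, X8);
        emit.RET();   // ret x30
    }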
\ No newline at end of file diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h new file mode 100644 index 0000000..6d82e9d --- /dev/null +++ b/src/dolphin/ArmCommon.h @@ -0,0 +1,27 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "../types.h" + +enum CCFlags +{ + CC_EQ = 0, // Equal + CC_NEQ, // Not equal + CC_CS, // Carry Set + CC_CC, // Carry Clear + CC_MI, // Minus (Negative) + CC_PL, // Plus + CC_VS, // Overflow + CC_VC, // No Overflow + CC_HI, // Unsigned higher + CC_LS, // Unsigned lower or same + CC_GE, // Signed greater than or equal + CC_LT, // Signed less than + CC_GT, // Signed greater than + CC_LE, // Signed less than or equal + CC_AL, // Always (unconditional) 14 + CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same + CC_LO = CC_CC, // Alias of CC_CC Unsigned lower +}; +const u32 NO_COND = 0xE0000000; diff --git a/src/dolphin/BitSet.h b/src/dolphin/BitSet.h new file mode 100644 index 0000000..d32b020 --- /dev/null +++ b/src/dolphin/BitSet.h @@ -0,0 +1,218 @@ +// This file is under the public domain. + +#pragma once + +#include <cstddef> +#include <initializer_list> +#include <type_traits> +#include "../types.h" + +#ifdef _WIN32 + +#include <intrin.h> + +namespace Common +{ +template <typename T> +constexpr int CountSetBits(T v) +{ + // from https://graphics.stanford.edu/~seander/bithacks.html + // GCC has this built in, but MSVC's intrinsic will only emit the actual + // POPCNT instruction, which we're not depending on + v = v - ((v >> 1) & (T) ~(T)0 / 3); + v = (v & (T) ~(T)0 / 15 * 3) + ((v >> 2) & (T) ~(T)0 / 15 * 3); + v = (v + (v >> 4)) & (T) ~(T)0 / 255 * 15; + return (T)(v * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * 8; +} +inline int LeastSignificantSetBit(u8 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u16 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u32 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u64 val) +{ + unsigned long index; + _BitScanForward64(&index, val); + return (int)index; +} +#else +namespace Common +{ +constexpr int CountSetBits(u8 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u16 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u32 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u64 val) +{ + return __builtin_popcountll(val); +} +inline int LeastSignificantSetBit(u8 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u16 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u32 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u64 val) +{ + return __builtin_ctzll(val); +} +#endif + +// Similar to std::bitset, this is a class which encapsulates a bitset, i.e. +// using the set bits of an integer to represent a set of integers. Like that +// class, it acts like an array of bools: +// BitSet32 bs; +// bs[1] = true; +// but also like the underlying integer ([0] = least significant bit): +// BitSet32 bs2 = ...; +// bs = (bs ^ bs2) & BitSet32(0xffff); +// The following additional functionality is provided: +// - Construction using an initializer list. 
+// BitSet bs { 1, 2, 4, 8 }; +// - Efficiently iterating through the set bits: +// for (int i : bs) +// [i is the *index* of a set bit] +// (This uses the appropriate CPU instruction to find the next set bit in one +// operation.) +// - Counting set bits using .Count() - see comment on that method. + +// TODO: use constexpr when MSVC gets out of the Dark Ages + +template <typename IntTy> +class BitSet +{ + static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types"); + +public: + // A reference to a particular bit, returned from operator[]. + class Ref + { + public: + constexpr Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {} + constexpr Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {} + constexpr operator bool() const { return (m_bs->m_val & m_mask) != 0; } + bool operator=(bool set) + { + m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0); + return set; + } + + private: + BitSet* m_bs; + IntTy m_mask; + }; + + // A STL-like iterator is required to be able to use range-based for loops. + class Iterator + { + public: + constexpr Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} + constexpr Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} + Iterator& operator=(Iterator other) + { + new (this) Iterator(other); + return *this; + } + Iterator& operator++() + { + if (m_val == 0) + { + m_bit = -1; + } + else + { + int bit = LeastSignificantSetBit(m_val); + m_val &= ~(1 << bit); + m_bit = bit; + } + return *this; + } + Iterator operator++(int) + { + Iterator other(*this); + ++*this; + return other; + } + constexpr int operator*() const { return m_bit; } + constexpr bool operator==(Iterator other) const { return m_bit == other.m_bit; } + constexpr bool operator!=(Iterator other) const { return m_bit != other.m_bit; } + + private: + IntTy m_val; + int m_bit; + }; + + constexpr BitSet() : m_val(0) {} + constexpr explicit BitSet(IntTy val) : m_val(val) {} + BitSet(std::initializer_list<int> init) + { + m_val = 0; + for (int bit : init) + m_val |= (IntTy)1 << bit; + } + + constexpr static BitSet AllTrue(size_t count) + { + return BitSet(count == sizeof(IntTy) * 8 ? 
~(IntTy)0 : (((IntTy)1 << count) - 1)); + } + + Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); } + constexpr const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; } + constexpr bool operator==(BitSet other) const { return m_val == other.m_val; } + constexpr bool operator!=(BitSet other) const { return m_val != other.m_val; } + constexpr bool operator<(BitSet other) const { return m_val < other.m_val; } + constexpr bool operator>(BitSet other) const { return m_val > other.m_val; } + constexpr BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); } + constexpr BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); } + constexpr BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); } + constexpr BitSet operator~() const { return BitSet(~m_val); } + constexpr BitSet operator<<(IntTy shift) const { return BitSet(m_val << shift); } + constexpr BitSet operator>>(IntTy shift) const { return BitSet(m_val >> shift); } + constexpr explicit operator bool() const { return m_val != 0; } + BitSet& operator|=(BitSet other) { return *this = *this | other; } + BitSet& operator&=(BitSet other) { return *this = *this & other; } + BitSet& operator^=(BitSet other) { return *this = *this ^ other; } + BitSet& operator<<=(IntTy shift) { return *this = *this << shift; } + BitSet& operator>>=(IntTy shift) { return *this = *this >> shift; } + // Warning: Even though on modern CPUs this is a single fast instruction, + // Dolphin's official builds do not currently assume POPCNT support on x86, + // so slower explicit bit twiddling is generated. Still should generally + // be faster than a loop. + constexpr unsigned int Count() const { return CountSetBits(m_val); } + constexpr Iterator begin() const { return ++Iterator(m_val, 0); } + constexpr Iterator end() const { return Iterator(m_val, -1); } + IntTy m_val; +}; +} // namespace Common + +using BitSet8 = Common::BitSet<u8>; +using BitSet16 = Common::BitSet<u16>; +using BitSet32 = Common::BitSet<u32>; +using BitSet64 = Common::BitSet<u64>; diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h new file mode 100644 index 0000000..8b64a92 --- /dev/null +++ b/src/dolphin/BitUtils.h @@ -0,0 +1,254 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <climits> +#include <cstddef> +#include <cstring> +#include <type_traits> + +namespace Common +{ +/// +/// Retrieves the size of a type in bits. +/// +/// @tparam T Type to get the size of. +/// +/// @return the size of the type in bits. +/// +template <typename T> +constexpr size_t BitSize() noexcept +{ + return sizeof(T) * CHAR_BIT; +} + +/// +/// Extracts a bit from a value. +/// +/// @param src The value to extract a bit from. +/// @param bit The bit to extract. +/// +/// @tparam T The type of the value. +/// +/// @return The extracted bit. +/// +template <typename T> +constexpr T ExtractBit(const T src, const size_t bit) noexcept +{ + return (src >> bit) & static_cast<T>(1); +} + +/// +/// Extracts a bit from a value. +/// +/// @param src The value to extract a bit from. +/// +/// @tparam bit The bit to extract. +/// @tparam T The type of the value. +/// +/// @return The extracted bit. 
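/// Editor's note (illustrative, not part of this commit):
/// ExtractBit<3>(u32{0b1000}) == 1 and ExtractBit<0>(u32{0b1000}) == 0.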
+/// +template <size_t bit, typename T> +constexpr T ExtractBit(const T src) noexcept +{ + static_assert(bit < BitSize<T>(), "Specified bit must be within T's bit width."); + + return ExtractBit(src, bit); +} + +/// +/// Extracts a range of bits from a value. +/// +/// @param src The value to extract the bits from. +/// @param begin The beginning of the bit range. This is inclusive. +/// @param end The ending of the bit range. This is inclusive. +/// +/// @tparam T The type of the value. +/// @tparam Result The returned result type. This is the unsigned analog +/// of a signed type if a signed type is passed as T. +/// +/// @return The extracted bits. +/// +template <typename T, typename Result = std::make_unsigned_t<T>> +constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept +{ + return static_cast<Result>(((static_cast<Result>(src) << ((BitSize<T>() - 1) - end)) >> + (BitSize<T>() - end + begin - 1))); +} + +/// +/// Extracts a range of bits from a value. +/// +/// @param src The value to extract the bits from. +/// +/// @tparam begin The beginning of the bit range. This is inclusive. +/// @tparam end The ending of the bit range. This is inclusive. +/// @tparam T The type of the value. +/// @tparam Result The returned result type. This is the unsigned analog +/// of a signed type if a signed type is passed as T. +/// +/// @return The extracted bits. +/// +template <size_t begin, size_t end, typename T, typename Result = std::make_unsigned_t<T>> +constexpr Result ExtractBits(const T src) noexcept +{ + static_assert(begin < end, "Beginning bit must be less than the ending bit."); + static_assert(begin < BitSize<T>(), "Beginning bit is larger than T's bit width."); + static_assert(end < BitSize<T>(), "Ending bit is larger than T's bit width."); + + return ExtractBits<T, Result>(src, begin, end); +} + +/// +/// Rotates a value left (ROL). +/// +/// @param value The value to rotate. +/// @param amount The number of bits to rotate the value. +/// @tparam T An unsigned type. +/// +/// @return The rotated value. +/// +template <typename T> +constexpr T RotateLeft(const T value, size_t amount) noexcept +{ + static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types left."); + + amount %= BitSize<T>(); + + if (amount == 0) + return value; + + return static_cast<T>((value << amount) | (value >> (BitSize<T>() - amount))); +} + +/// +/// Rotates a value right (ROR). +/// +/// @param value The value to rotate. +/// @param amount The number of bits to rotate the value. +/// @tparam T An unsigned type. +/// +/// @return The rotated value. +/// +template <typename T> +constexpr T RotateRight(const T value, size_t amount) noexcept +{ + static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types right."); + + amount %= BitSize<T>(); + + if (amount == 0) + return value; + + return static_cast<T>((value >> amount) | (value << (BitSize<T>() - amount))); +} + +/// +/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11. +/// Both edge cases of all zeros and all ones are considered valid masks, too. +/// +/// @param mask The mask value to test for validity. +/// +/// @tparam T The type of the value. +/// +/// @return A bool indicating whether the mask is valid. 
+/// +template <typename T> +constexpr bool IsValidLowMask(const T mask) noexcept +{ + static_assert(std::is_integral<T>::value, "Mask must be an integral type."); + static_assert(std::is_unsigned<T>::value, "Signed masks can introduce hard to find bugs."); + + // Can be efficiently determined without looping or bit counting. It's the counterpart + // to https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 + // and doesn't require special casing either edge case. + return (mask & (mask + 1)) == 0; +} + +/// +/// Reinterpret objects of one type as another by bit-casting between object representations. +/// +/// @remark This is the example implementation of std::bit_cast which is to be included +/// in C++2a. See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html +/// for more details. The only difference is this variant is not constexpr, +/// as the mechanism for bit_cast requires a compiler built-in to have that quality. +/// +/// @param source The source object to convert to another representation. +/// +/// @tparam To The type to reinterpret source as. +/// @tparam From The initial type representation of source. +/// +/// @return The representation of type From as type To. +/// +/// @pre Both To and From types must be the same size +/// @pre Both To and From types must satisfy the TriviallyCopyable concept. +/// +template <typename To, typename From> +inline To BitCast(const From& source) noexcept +{ + static_assert(sizeof(From) == sizeof(To), + "BitCast source and destination types must be equal in size."); + static_assert(std::is_trivially_copyable<From>(), + "BitCast source type must be trivially copyable."); + static_assert(std::is_trivially_copyable<To>(), + "BitCast destination type must be trivially copyable."); + + std::aligned_storage_t<sizeof(To), alignof(To)> storage; + std::memcpy(&storage, &source, sizeof(storage)); + return reinterpret_cast<To&>(storage); +} + +template <typename T, typename PtrType> +class BitCastPtrType +{ +public: + static_assert(std::is_trivially_copyable<PtrType>(), + "BitCastPtr source type must be trivially copyable."); + static_assert(std::is_trivially_copyable<T>(), + "BitCastPtr destination type must be trivially copyable."); + + explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {} + + // Enable operator= only for pointers to non-const data + template <typename S> + inline typename std::enable_if<std::is_same<S, T>() && !std::is_const<PtrType>()>::type + operator=(const S& source) + { + std::memcpy(m_ptr, &source, sizeof(source)); + } + + inline operator T() const + { + T result; + std::memcpy(&result, m_ptr, sizeof(result)); + return result; + } + +private: + PtrType* m_ptr; +}; + +// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs +// Conversion constructor and operator= provided for a convenient syntax. 
+// Usage: MyStruct s = BitCastPtr<MyStruct>(some_ptr); +// BitCastPtr<MyStruct>(some_ptr) = s; +template <typename T, typename PtrType> +inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType<T, PtrType> +{ + return BitCastPtrType<T, PtrType>{ptr}; +} + +template <typename T> +void SetBit(T& value, size_t bit_number, bool bit_value) +{ + static_assert(std::is_unsigned<T>(), "SetBit is only sane on unsigned types."); + + if (bit_value) + value |= (T{1} << bit_number); + else + value &= ~(T{1} << bit_number); +} + +} // namespace Common diff --git a/src/dolphin/CPUDetect.h b/src/dolphin/CPUDetect.h new file mode 100644 index 0000000..bd4fd8d --- /dev/null +++ b/src/dolphin/CPUDetect.h @@ -0,0 +1,76 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// Detect the CPU, so we'll know which optimizations to use +#pragma once + +#include <string> + +enum class CPUVendor +{ + Intel, + AMD, + ARM, + Other, +}; + +struct CPUInfo +{ + CPUVendor vendor = CPUVendor::Intel; + + char cpu_string[0x41] = {}; + char brand_string[0x21] = {}; + bool OS64bit = false; + bool CPU64bit = false; + bool Mode64bit = false; + + bool HTT = false; + int num_cores = 0; + int logical_cpu_count = 0; + + bool bSSE = false; + bool bSSE2 = false; + bool bSSE3 = false; + bool bSSSE3 = false; + bool bPOPCNT = false; + bool bSSE4_1 = false; + bool bSSE4_2 = false; + bool bLZCNT = false; + bool bSSE4A = false; + bool bAVX = false; + bool bAVX2 = false; + bool bBMI1 = false; + bool bBMI2 = false; + bool bFMA = false; + bool bFMA4 = false; + bool bAES = false; + // FXSAVE/FXRSTOR + bool bFXSR = false; + bool bMOVBE = false; + // This flag indicates that the hardware supports some mode + // in which denormal inputs _and_ outputs are automatically set to (signed) zero. + bool bFlushToZero = false; + bool bLAHFSAHF64 = false; + bool bLongMode = false; + bool bAtom = false; + + // ARMv8 specific + bool bFP = false; + bool bASIMD = false; + bool bCRC32 = false; + bool bSHA1 = false; + bool bSHA2 = false; + + // Call Detect() + explicit CPUInfo(); + + // Turn the CPU info into a string we can show + std::string Summarize(); + +private: + // Detects the various CPU features + void Detect(); +}; + +extern CPUInfo cpu_info; diff --git a/src/dolphin/CommonFuncs.cpp b/src/dolphin/CommonFuncs.cpp new file mode 100644 index 0000000..f85051d --- /dev/null +++ b/src/dolphin/CommonFuncs.cpp @@ -0,0 +1,52 @@ +// Copyright 2009 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include <cstddef> +#include <cstring> +#include <errno.h> +#include <type_traits> + +#include "CommonFuncs.h" + +#ifdef _WIN32 +#include <windows.h> +#define strerror_r(err, buf, len) strerror_s(buf, len, err) +#endif + +constexpr size_t BUFFER_SIZE = 256; + +// Wrapper function to get last strerror(errno) string. +// This function might change the error code. +std::string LastStrerrorString() +{ + char error_message[BUFFER_SIZE]; + + // There are two variants of strerror_r. The XSI version stores the message to the passed-in + // buffer and returns an int (0 on success). The GNU version returns a pointer to the message, + // which might have been stored in the passed-in buffer or might be a static string. + + // We check defines in order to figure out variant is in use, and we store the returned value + // to a variable so that we'll get a compile-time check that our assumption was correct. 
+ +#if defined(__GLIBC__) && (_GNU_SOURCE || (_POSIX_C_SOURCE < 200112L && _XOPEN_SOURCE < 600)) + const char* str = strerror_r(errno, error_message, BUFFER_SIZE); + return std::string(str); +#else + int error_code = strerror_r(errno, error_message, BUFFER_SIZE); + return error_code == 0 ? std::string(error_message) : ""; +#endif +} + +#ifdef _WIN32 +// Wrapper function to get GetLastError() string. +// This function might change the error code. +std::string GetLastErrorString() +{ + char error_message[BUFFER_SIZE]; + + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, GetLastError(), + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), error_message, BUFFER_SIZE, nullptr); + return std::string(error_message); +} +#endif diff --git a/src/dolphin/CommonFuncs.h b/src/dolphin/CommonFuncs.h new file mode 100644 index 0000000..708fbc3 --- /dev/null +++ b/src/dolphin/CommonFuncs.h @@ -0,0 +1,58 @@ +// Copyright 2009 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#pragma once + +#include <cstddef> +#include <string> +#include "../types.h" + +// Will fail to compile on a non-array: +template <typename T, size_t N> +constexpr size_t ArraySize(T (&arr)[N]) +{ + return N; +} + +#ifndef _WIN32 + +// go to debugger mode +#define Crash() \ + { \ + __builtin_trap(); \ + } + +#else // WIN32 +// Function Cross-Compatibility +#define strcasecmp _stricmp +#define strncasecmp _strnicmp +#define unlink _unlink +#define vscprintf _vscprintf + +// 64 bit offsets for Windows +#define fseeko _fseeki64 +#define ftello _ftelli64 +#define atoll _atoi64 +#define stat _stat64 +#define fstat _fstat64 +#define fileno _fileno + +extern "C" { +__declspec(dllimport) void __stdcall DebugBreak(void); +} +#define Crash() \ + { \ + DebugBreak(); \ + } +#endif // WIN32 ndef + +// Wrapper function to get last strerror(errno) string. +// This function might change the error code. +std::string LastStrerrorString(); + +#ifdef _WIN32 +// Wrapper function to get GetLastError() string. +// This function might change the error code. +std::string GetLastErrorString(); +#endif diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h new file mode 100644 index 0000000..787d505 --- /dev/null +++ b/src/dolphin/Compat.h @@ -0,0 +1,75 @@ +// Stubs for Assert.h and Log.h +#pragma once + +#include <assert.h> + +// Assert stub +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ + +// Log Stub +#include <cstdio> + +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define DYNA_REC 0 + +#define ERROR_LOG(which, fmt, ...) 
\ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + } while (false) + +#if __cplusplus < 201703L +// cheat +namespace std +{ +template <typename T> +T clamp(const T& v, const T& lo, const T& hi) +{ + return v < lo ? lo : (v > hi ? hi : v); +} +} +#endif
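The std::clamp backport just above exists because headers imported from Dolphin later in this commit (MathUtil.h's Rectangle::ClampLL/ClampUL, for example) call std::clamp, which only ships with C++17. A minimal caller-side sketch of what that enables; ClampVolume is an illustrative name, not something from this patch:

    #include <algorithm>   // real std::clamp when building as C++17 or newer
    #include "Compat.h"    // fallback std::clamp when __cplusplus < 201703L

    int ClampVolume(int v)
    {
        // Resolves to the Compat.h shim on older language standards and to
        // <algorithm>'s implementation otherwise; both return the same result.
        return std::clamp(v, 0, 127);
    }

Injecting names into namespace std is technically undefined behaviour, which is presumably why the shim is labelled "cheat"; it is only compiled when the real function is unavailable.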
\ No newline at end of file diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp new file mode 100644 index 0000000..70f2ede --- /dev/null +++ b/src/dolphin/MathUtil.cpp @@ -0,0 +1,13 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "MathUtil.h" + +#include <numeric> + +// Calculate sum of a float list +float MathFloatVectorSum(const std::vector<float>& Vec) +{ + return std::accumulate(Vec.begin(), Vec.end(), 0.0f); +} diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h new file mode 100644 index 0000000..b1dbbae --- /dev/null +++ b/src/dolphin/MathUtil.h @@ -0,0 +1,121 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <vector> + +#include "Compat.h" + +#include "../types.h" + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +namespace MathUtil +{ +constexpr double TAU = 6.2831853071795865; +constexpr double PI = TAU / 2; + +template <typename T> +constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{})) +{ + return (T{} < val) - (val < T{}); +} + +template <typename T, typename F> +constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a) +{ + return x + (y - x) * a; +} + +template <typename T> +constexpr bool IsPow2(T imm) +{ + return imm > 0 && (imm & (imm - 1)) == 0; +} + +constexpr u32 NextPowerOf2(u32 value) +{ + --value; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + ++value; + + return value; +} + +template <class T> +struct Rectangle +{ + T left{}; + T top{}; + T right{}; + T bottom{}; + + constexpr Rectangle() = default; + + constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom) + : left(theLeft), top(theTop), right(theRight), bottom(theBottom) + { + } + + constexpr bool operator==(const Rectangle& r) const + { + return left == r.left && top == r.top && right == r.right && bottom == r.bottom; + } + + T GetWidth() const { return abs(right - left); } + T GetHeight() const { return abs(bottom - top); } + // If the rectangle is in a coordinate system with a lower-left origin, use + // this Clamp. + void ClampLL(T x1, T y1, T x2, T y2) + { + left = std::clamp(left, x1, x2); + right = std::clamp(right, x1, x2); + top = std::clamp(top, y2, y1); + bottom = std::clamp(bottom, y2, y1); + } + + // If the rectangle is in a coordinate system with an upper-left origin, + // use this Clamp. + void ClampUL(T x1, T y1, T x2, T y2) + { + left = std::clamp(left, x1, x2); + right = std::clamp(right, x1, x2); + top = std::clamp(top, y1, y2); + bottom = std::clamp(bottom, y1, y2); + } +}; + +} // namespace MathUtil + +float MathFloatVectorSum(const std::vector<float>&); + +// Rounds down. 
0 -> undefined +inline int IntLog2(u64 val) +{ +#if defined(__GNUC__) + return 63 - __builtin_clzll(val); + +#elif defined(_MSC_VER) + unsigned long result = ULONG_MAX; + _BitScanReverse64(&result, val); + return result; + +#else + int result = -1; + while (val != 0) + { + val >>= 1; + ++result; + } + return result; +#endif +} diff --git a/src/dolphin/license_dolphin.txt b/src/dolphin/license_dolphin.txt new file mode 100644 index 0000000..d511905 --- /dev/null +++ b/src/dolphin/license_dolphin.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+ This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/src/dolphin/x64ABI.cpp b/src/dolphin/x64ABI.cpp new file mode 100644 index 0000000..d86a158 --- /dev/null +++ b/src/dolphin/x64ABI.cpp @@ -0,0 +1,119 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include "../types.h" +#include "x64ABI.h" +#include "x64Emitter.h" + +using namespace Gen; + +// Shared code between Win64 and Unix64 + +void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) +{ + size_t shadow = 0; +#if defined(_WIN32) + shadow = 0x20; +#endif + + int count = (mask & ABI_ALL_GPRS).Count(); + rsp_alignment -= count * 8; + size_t subtraction = 0; + int fpr_count = (mask & ABI_ALL_FPRS).Count(); + if (fpr_count) + { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xf; + } + subtraction += 16 * fpr_count; + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += shadow; + // Final alignment. + rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xf; + + *shadowp = shadow; + *subtractionp = subtraction; + *xmm_offsetp = subtraction - xmm_base_subtraction; +} + +size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int r : mask& ABI_ALL_GPRS) + PUSH((X64Reg)r); + + if (subtraction) + SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16)); + xmm_offset += 16; + } + + return shadow; +} + +void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD((X64Reg)(x - 16), MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; + } + + if (subtraction) + ADD(64, R(RSP), subtraction >= 0x80 ? 
Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int r = 15; r >= 0; r--) + { + if (mask[r]) + POP((X64Reg)r); + } +} + +void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2, + Gen::X64Reg src2) +{ + if (dst1 == src2 && dst2 == src1) + { + XCHG(bits, R(src1), R(src2)); + if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + } + else if (src2 != dst1) + { + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) + MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + if (dst2 != src2) + MOV(bits, R(dst2), R(src2)); + } + else + { + if (dst2 != src2) + MOV(bits, R(dst2), R(src2)); + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) + MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + } +} diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h new file mode 100644 index 0000000..94336d0 --- /dev/null +++ b/src/dolphin/x64ABI.h @@ -0,0 +1,58 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#pragma once + +#include "BitSet.h" +#include "x64Reg.h" + +// x64 ABI:s, and helpers to help follow them when JIT-ing code. +// All convensions return values in EAX (+ possibly EDX). + +// Windows 64-bit +// * 4-reg "fastcall" variant, very new-skool stack handling +// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself +// calls_ +// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space. +// Scratch: RAX RCX RDX R8 R9 R10 R11 +// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15 +// Parameters: RCX RDX R8 R9, further MOV-ed + +// Linux 64-bit +// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed) +// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11 +// Callee-save: RBX RBP R12 R13 R14 R15 +// Parameters: RDI RSI RDX RCX R8 R9 + +#define ABI_ALL_FPRS BitSet32(0xffff0000) +#define ABI_ALL_GPRS BitSet32(0x0000ffff) + +#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention + +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX +#define ABI_PARAM3 R8 +#define ABI_PARAM4 R9 + +// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. +#define ABI_ALL_CALLER_SAVED \ + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) +#else // 64-bit Unix / OS X + +#define ABI_PARAM1 RDI +#define ABI_PARAM2 RSI +#define ABI_PARAM3 RDX +#define ABI_PARAM4 RCX +#define ABI_PARAM5 R8 +#define ABI_PARAM6 R9 + +// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably +// don't actually clobber them. +#define ABI_ALL_CALLER_SAVED (BitSet32{RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11} | ABI_ALL_FPRS) +#endif // WIN32 + +#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED) + +#define ABI_RETURN RAX diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp new file mode 100644 index 0000000..49b51c9 --- /dev/null +++ b/src/dolphin/x64CPUDetect.cpp @@ -0,0 +1,273 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. 
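As a rough illustration of the calling conventions documented in x64ABI.h above, the sketch below brackets a call out of generated code with the push/pop helpers from x64ABI.cpp. EmitHelperCall, the alignment value of 8 and the zero frame size are assumptions made for the example; only the ABI_* names, BitSet32 and the emitter methods come from this patch:

    #include "x64ABI.h"
    #include "x64Emitter.h"

    // Emit a call to a C/C++ helper, preserving the caller-saved GPRs
    // (XMM registers would be added through the ABI_ALL_FPRS bits).
    void EmitHelperCall(Gen::XEmitter& emit, const void* helper, Gen::X64Reg arg_reg)
    {
        using namespace Gen;

        BitSet32 to_save = ABI_ALL_CALLER_SAVED & ABI_ALL_GPRS;

        // The 8 assumes RSP is 8 bytes off 16-byte alignment, as it typically is
        // right after the CALL that entered the generated code; no extra frame
        // space is requested, and Win64 shadow space is handled internally.
        emit.ABI_PushRegistersAndAdjustStack(to_save, 8, 0);

        if (arg_reg != ABI_PARAM1)
            emit.MOV(64, R(ABI_PARAM1), R(arg_reg)); // first integer argument register

        // CALL() asserts the target is within rel32 range of the code buffer.
        emit.CALL(helper);                           // result comes back in ABI_RETURN (RAX)

        emit.ABI_PopRegistersAndAdjustStack(to_save, 8, 0);
    }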
+ +#include <cstring> +#include <string> + +#include "CPUDetect.h" +#include "../types.h" + +#ifndef _MSVC_VER + +#ifdef __FreeBSD__ +#include <unistd.h> + +#include <machine/cpufunc.h> +#include <sys/types.h> +#endif + +static inline void __cpuidex(int info[4], int function_id, int subfunction_id) +{ +#ifdef __FreeBSD__ + // Despite the name, this is just do_cpuid() with ECX as second input. + cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info); +#else + info[0] = function_id; // eax + info[2] = subfunction_id; // ecx + __asm__("cpuid" + : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) + : "a"(function_id), "c"(subfunction_id)); +#endif +} + +static inline void __cpuid(int info[4], int function_id) +{ + return __cpuidex(info, function_id, 0); +} + +#endif // ifndef _WIN32 + +#ifdef _MSVC_VER + +static u64 xgetbv(u32 index) +{ + return _xgetbv(index); +} +constexpr u32 XCR_XFEATURE_ENABLED_MASK = _XCR_XFEATURE_ENABLED_MASK; + +#else + +static u64 xgetbv(u32 index) +{ + u32 eax, edx; + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((u64)edx << 32) | eax; +} +constexpr u32 XCR_XFEATURE_ENABLED_MASK = 0; +#endif // ifdef _WIN32 + +CPUInfo cpu_info; + +CPUInfo::CPUInfo() +{ + Detect(); +} + +// Detects the various CPU features +void CPUInfo::Detect() +{ +#ifdef _M_X86_64 + Mode64bit = true; + OS64bit = true; +#endif + num_cores = 1; + + // Set obvious defaults, for extra safety + if (Mode64bit) + { + bSSE = true; + bSSE2 = true; + bLongMode = true; + } + + // Assume CPU supports the CPUID instruction. Those that don't can barely + // boot modern OS:es anyway. + int cpu_id[4]; + + // Detect CPU's CPUID capabilities, and grab CPU string + __cpuid(cpu_id, 0x00000000); + u32 max_std_fn = cpu_id[0]; // EAX + std::memcpy(&brand_string[0], &cpu_id[1], sizeof(int)); + std::memcpy(&brand_string[4], &cpu_id[3], sizeof(int)); + std::memcpy(&brand_string[8], &cpu_id[2], sizeof(int)); + __cpuid(cpu_id, 0x80000000); + u32 max_ex_fn = cpu_id[0]; + if (!strcmp(brand_string, "GenuineIntel")) + vendor = CPUVendor::Intel; + else if (!strcmp(brand_string, "AuthenticAMD")) + vendor = CPUVendor::AMD; + else + vendor = CPUVendor::Other; + + // Set reasonable default brand string even if brand string not available. + strcpy(cpu_string, brand_string); + + // Detect family and other misc stuff. + bool ht = false; + HTT = ht; + logical_cpu_count = 1; + if (max_std_fn >= 1) + { + __cpuid(cpu_id, 0x00000001); + int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff); + int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0); + // Detect people unfortunate enough to be running Dolphin on an Atom + if (family == 6 && + (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 || + model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D)) + bAtom = true; + logical_cpu_count = (cpu_id[1] >> 16) & 0xFF; + ht = (cpu_id[3] >> 28) & 1; + + if ((cpu_id[3] >> 25) & 1) + bSSE = true; + if ((cpu_id[3] >> 26) & 1) + bSSE2 = true; + if ((cpu_id[2]) & 1) + bSSE3 = true; + if ((cpu_id[2] >> 9) & 1) + bSSSE3 = true; + if ((cpu_id[2] >> 19) & 1) + bSSE4_1 = true; + if ((cpu_id[2] >> 20) & 1) + bSSE4_2 = true; + if ((cpu_id[2] >> 22) & 1) + bMOVBE = true; + if ((cpu_id[2] >> 25) & 1) + bAES = true; + + if ((cpu_id[3] >> 24) & 1) + { + // We can use FXSAVE. + bFXSR = true; + } + + // AVX support requires 3 separate checks: + // - Is the AVX bit set in CPUID? + // - Is the XSAVE bit set in CPUID? 
+ // - XGETBV result has the XCR bit set. + if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) + { + if ((xgetbv(XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) + { + bAVX = true; + if ((cpu_id[2] >> 12) & 1) + bFMA = true; + } + } + + if (max_std_fn >= 7) + { + __cpuidex(cpu_id, 0x00000007, 0x00000000); + // careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed + if ((cpu_id[1] >> 5) & 1) + bAVX2 = bAVX; + if ((cpu_id[1] >> 3) & 1) + bBMI1 = true; + if ((cpu_id[1] >> 8) & 1) + bBMI2 = true; + } + } + + bFlushToZero = bSSE; + + if (max_ex_fn >= 0x80000004) + { + // Extract CPU model string + __cpuid(cpu_id, 0x80000002); + memcpy(cpu_string, cpu_id, sizeof(cpu_id)); + __cpuid(cpu_id, 0x80000003); + memcpy(cpu_string + 16, cpu_id, sizeof(cpu_id)); + __cpuid(cpu_id, 0x80000004); + memcpy(cpu_string + 32, cpu_id, sizeof(cpu_id)); + } + if (max_ex_fn >= 0x80000001) + { + // Check for more features. + __cpuid(cpu_id, 0x80000001); + if (cpu_id[2] & 1) + bLAHFSAHF64 = true; + if ((cpu_id[2] >> 5) & 1) + bLZCNT = true; + if ((cpu_id[2] >> 16) & 1) + bFMA4 = true; + if ((cpu_id[3] >> 29) & 1) + bLongMode = true; + } + + num_cores = (logical_cpu_count == 0) ? 1 : logical_cpu_count; + + if (max_ex_fn >= 0x80000008) + { + // Get number of cores. This is a bit complicated. Following AMD manual here. + __cpuid(cpu_id, 0x80000008); + int apic_id_core_id_size = (cpu_id[2] >> 12) & 0xF; + if (apic_id_core_id_size == 0) + { + if (ht) + { + // New mechanism for modern Intel CPUs. + if (vendor == CPUVendor::Intel) + { + __cpuidex(cpu_id, 0x00000004, 0x00000000); + int cores_x_package = ((cpu_id[0] >> 26) & 0x3F) + 1; + HTT = (cores_x_package < logical_cpu_count); + cores_x_package = ((logical_cpu_count % cores_x_package) == 0) ? cores_x_package : 1; + num_cores = (cores_x_package > 1) ? cores_x_package : num_cores; + logical_cpu_count /= cores_x_package; + } + } + } + else + { + // Use AMD's new method. + num_cores = (cpu_id[2] & 0xFF) + 1; + } + } +} + +// Turn the CPU info into a string we can show +std::string CPUInfo::Summarize() +{ + std::string sum(cpu_string); + sum += " ("; + sum += brand_string; + sum += ")"; + + if (bSSE) + sum += ", SSE"; + if (bSSE2) + { + sum += ", SSE2"; + if (!bFlushToZero) + sum += " (but not DAZ!)"; + } + if (bSSE3) + sum += ", SSE3"; + if (bSSSE3) + sum += ", SSSE3"; + if (bSSE4_1) + sum += ", SSE4.1"; + if (bSSE4_2) + sum += ", SSE4.2"; + if (HTT) + sum += ", HTT"; + if (bAVX) + sum += ", AVX"; + if (bAVX2) + sum += ", AVX2"; + if (bBMI1) + sum += ", BMI1"; + if (bBMI2) + sum += ", BMI2"; + if (bFMA) + sum += ", FMA"; + if (bAES) + sum += ", AES"; + if (bMOVBE) + sum += ", MOVBE"; + if (bLongMode) + sum += ", 64-bit support"; + return sum; +} diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp new file mode 100644 index 0000000..343f314 --- /dev/null +++ b/src/dolphin/x64Emitter.cpp @@ -0,0 +1,3399 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include <cinttypes> +#include <cstring> + +#include "CPUDetect.h" +#include "../types.h" +#include "x64Emitter.h" +#include "x64Reg.h" +#include "Compat.h" +#include "CommonFuncs.h" + +namespace Gen +{ +// TODO(ector): Add EAX special casing, for ever so slightly smaller code. 
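Before the emitter itself, a brief hedged sketch of how the detection results above are typically consumed; LogHostCPU is an invented name, while cpu_info, Summarize() and the feature booleans are the interface declared in CPUDetect.h:

    #include <cstdio>
    #include "CPUDetect.h"

    void LogHostCPU()
    {
        // cpu_info is the global instance declared in CPUDetect.h; its
        // constructor runs Detect(), so the flags are ready to read here.
        std::printf("Host CPU: %s\n", cpu_info.Summarize().c_str());

        // bAVX is only set once all three checks above passed (AVX CPUID bit,
        // OSXSAVE bit, and XGETBV reporting XMM+YMM state enabled by the OS).
        if (cpu_info.bAVX && !cpu_info.bAVX2)
            std::printf("note: AVX available but AVX2 is not\n");
        if (!cpu_info.bSSE4_1)
            std::printf("note: SSE4.1 missing, SSE2 fallback paths required\n");
    }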
+struct NormalOpDef +{ + u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext; +}; + +// 0xCC is code for invalid combination of immediates +static const NormalOpDef normalops[11] = { + {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, // ADD + {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, // ADC + + {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, // SUB + {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, // SBB + + {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, // AND + {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, // OR + + {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, // XOR + {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, // MOV + + {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, // TEST (to == from) + {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, // CMP + + {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, // XCHG +}; + +enum NormalSSEOps +{ + sseCMP = 0xC2, + sseADD = 0x58, // ADD + sseSUB = 0x5C, // SUB + sseAND = 0x54, // AND + sseANDN = 0x55, // ANDN + sseOR = 0x56, + sseXOR = 0x57, + sseMUL = 0x59, // MUL + sseDIV = 0x5E, // DIV + sseMIN = 0x5D, // MIN + sseMAX = 0x5F, // MAX + sseCOMIS = 0x2F, // COMIS + sseUCOMIS = 0x2E, // UCOMIS + sseSQRT = 0x51, // SQRT + sseRCP = 0x53, // RCP + sseRSQRT = 0x52, // RSQRT (NO DOUBLE PRECISION!!!) + sseMOVAPfromRM = 0x28, // MOVAP from RM + sseMOVAPtoRM = 0x29, // MOVAP to RM + sseMOVUPfromRM = 0x10, // MOVUP from RM + sseMOVUPtoRM = 0x11, // MOVUP to RM + sseMOVLPfromRM = 0x12, + sseMOVLPtoRM = 0x13, + sseMOVHPfromRM = 0x16, + sseMOVHPtoRM = 0x17, + sseMOVHLPS = 0x12, + sseMOVLHPS = 0x16, + sseMOVDQfromRM = 0x6F, + sseMOVDQtoRM = 0x7F, + sseMASKMOVDQU = 0xF7, + sseLDDQU = 0xF0, + sseSHUF = 0xC6, + sseMOVNTDQ = 0xE7, + sseMOVNTP = 0x2B, +}; + +enum class NormalOp +{ + ADD, + ADC, + SUB, + SBB, + AND, + OR, + XOR, + MOV, + TEST, + CMP, + XCHG, +}; + +enum class FloatOp +{ + LD = 0, + ST = 2, + STP = 3, + LD80 = 5, + STP80 = 7, + + Invalid = -1, +}; + +void XEmitter::SetCodePtr(u8* ptr) +{ + code = ptr; +} + +const u8* XEmitter::GetCodePtr() const +{ + return code; +} + +u8* XEmitter::GetWritableCodePtr() +{ + return code; +} + +void XEmitter::Write8(u8 value) +{ + *code++ = value; +} + +void XEmitter::Write16(u16 value) +{ + std::memcpy(code, &value, sizeof(u16)); + code += sizeof(u16); +} + +void XEmitter::Write32(u32 value) +{ + std::memcpy(code, &value, sizeof(u32)); + code += sizeof(u32); +} + +void XEmitter::Write64(u64 value) +{ + std::memcpy(code, &value, sizeof(u64)); + code += sizeof(u64); +} + +void XEmitter::ReserveCodeSpace(int bytes) +{ + for (int i = 0; i < bytes; i++) + *code++ = 0xCC; +} + +u8* XEmitter::AlignCodeTo(size_t alignment) +{ + ASSERT_MSG(DYNA_REC, alignment != 0 && (alignment & (alignment - 1)) == 0, + "Alignment must be power of two"); + u64 c = reinterpret_cast<u64>(code) & (alignment - 1); + if (c) + ReserveCodeSpace(static_cast<int>(alignment - c)); + return code; +} + +u8* XEmitter::AlignCode4() +{ + return AlignCodeTo(4); +} + +u8* XEmitter::AlignCode16() +{ + return AlignCodeTo(16); +} + +u8* XEmitter::AlignCodePage() +{ + return AlignCodeTo(4096); +} + +// This operation modifies flags; check to see the flags are locked. +// If the flags are locked, we should immediately and loudly fail before +// causing a subtle JIT bug. 
+void XEmitter::CheckFlags() +{ + ASSERT_MSG(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!"); +} + +void XEmitter::WriteModRM(int mod, int reg, int rm) +{ + Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); +} + +void XEmitter::WriteSIB(int scale, int index, int base) +{ + Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); +} + +void OpArg::WriteREX(XEmitter* emit, int opBits, int bits, int customOp) const +{ + if (customOp == -1) + customOp = operandReg; + u8 op = 0x40; + // REX.W (whether operation is a 64-bit operation) + if (opBits == 64) + op |= 8; + // REX.R (whether ModR/M reg field refers to R8-R15. + if (customOp & 8) + op |= 4; + // REX.X (whether ModR/M SIB index field refers to R8-R15) + if (indexReg & 8) + op |= 2; + // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15) + if (offsetOrBaseReg & 8) + op |= 1; + // Write REX if wr have REX bits to write, or if the operation accesses + // SIL, DIL, BPL, or SPL. + if (op != 0x40 || (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) || + (opBits == 8 && (customOp & 0x10c) == 4)) + { + emit->Write8(op); + // Check the operation doesn't access AH, BH, CH, or DH. + DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0); + DEBUG_ASSERT((customOp & 0x100) == 0); + } +} + +void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W) const +{ + int R = !(regOp1 & 8); + int X = !(indexReg & 8); + int B = !(offsetOrBaseReg & 8); + + int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf); + + // do we need any VEX fields that only appear in the three-byte form? + if (X == 1 && B == 1 && W == 0 && mmmmm == 1) + { + u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp; + emit->Write8(0xC5); + emit->Write8(RvvvvLpp); + } + else + { + u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm; + u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp; + emit->Write8(0xC4); + emit->Write8(RXBmmmmm); + emit->Write8(WvvvvLpp); + } +} + +void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg, + bool warn_64bit_offset) const +{ + if (_operandReg == INVALID_REG) + _operandReg = (X64Reg)this->operandReg; + int mod = 0; + int ireg = indexReg; + bool SIB = false; + int _offsetOrBaseReg = this->offsetOrBaseReg; + + if (scale == SCALE_RIP) // Also, on 32-bit, just an immediate address + { + // Oh, RIP addressing. + _offsetOrBaseReg = 5; + emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); + // TODO : add some checks + u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; + s64 distance = (s64)offset - (s64)ripAddr; + ASSERT_MSG(DYNA_REC, + (distance < 0x80000000LL && distance >= -0x80000000LL) || !warn_64bit_offset, + "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", ripAddr, offset); + s32 offs = (s32)distance; + emit->Write32((u32)offs); + return; + } + + if (scale == 0) + { + // Oh, no memory, Just a reg. + mod = 3; // 11 + } + else + { + // Ah good, no scaling. + if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5)) + { + // Okay, we're good. No SIB necessary. 
+ int ioff = (int)offset; + if (ioff == 0) + { + mod = 0; + } + else if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) + { + SIB = true; + mod = 0; + _offsetOrBaseReg = 5; + } + else + { + if ((_offsetOrBaseReg & 7) == 4) // this would occupy the SIB encoding :( + { + // So we have to fake it with SIB encoding :( + SIB = true; + } + + if (scale >= SCALE_1 && scale < SCALE_ATREG) + { + SIB = true; + } + + if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) + { + SIB = true; + ireg = _offsetOrBaseReg; + } + + // Okay, we're fine. Just disp encoding. + // We need displacement. Which size? + int ioff = (int)(s64)offset; + if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + } + + // Okay. Time to do the actual writing + // ModRM byte: + int oreg = _offsetOrBaseReg; + if (SIB) + oreg = 4; + + emit->WriteModRM(mod, _operandReg & 7, oreg & 7); + + if (SIB) + { + // SIB byte + int ss; + switch (scale) + { + case SCALE_NONE: + _offsetOrBaseReg = 4; + ss = 0; + break; // RSP + case SCALE_1: + ss = 0; + break; + case SCALE_2: + ss = 1; + break; + case SCALE_4: + ss = 2; + break; + case SCALE_8: + ss = 3; + break; + case SCALE_NOBASE_2: + ss = 1; + break; + case SCALE_NOBASE_4: + ss = 2; + break; + case SCALE_NOBASE_8: + ss = 3; + break; + case SCALE_ATREG: + ss = 0; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "Invalid scale for SIB byte"); + ss = 0; + break; + } + emit->Write8((u8)((ss << 6) | ((ireg & 7) << 3) | (_offsetOrBaseReg & 7))); + } + + if (mod == 1) // 8-bit disp + { + emit->Write8((u8)(s8)(s32)offset); + } + else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) // 32-bit disp + { + emit->Write32((u32)offset); + } +} + +// W = operand extended width (1 if 64-bit) +// R = register# upper bit +// X = scale amnt upper bit +// B = base register# upper bit +void XEmitter::Rex(int w, int r, int x, int b) +{ + w = w ? 1 : 0; + r = r ? 1 : 0; + x = x ? 1 : 0; + b = b ? 
1 : 0; + u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); + if (rx != 0x40) + Write8(rx); +} + +void XEmitter::JMP(const u8* addr, bool force5Bytes) +{ + u64 fn = (u64)addr; + if (!force5Bytes) + { + s64 distance = (s64)(fn - ((u64)code + 2)); + ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + // 8 bits will do + Write8(0xEB); + Write8((u8)(s8)distance); + } + else + { + s64 distance = (s64)(fn - ((u64)code + 5)); + + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0xE9); + Write32((u32)(s32)distance); + } +} + +void XEmitter::JMPptr(const OpArg& arg2) +{ + OpArg arg = arg2; + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "JMPptr - Imm argument"); + arg.operandReg = 4; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +// Can be used to trap other processors, before overwriting their code +// not used in Dolphin +void XEmitter::JMPself() +{ + Write8(0xEB); + Write8(0xFE); +} + +void XEmitter::CALLptr(OpArg arg) +{ + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "CALLptr - Imm argument"); + arg.operandReg = 2; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +void XEmitter::CALL(const void* fnptr) +{ + u64 distance = u64(fnptr) - (u64(code) + 5); + ASSERT_MSG(DYNA_REC, distance < 0x0000000080000000ULL || distance >= 0xFFFFFFFF80000000ULL, + "CALL out of range (%p calls %p)", code, fnptr); + Write8(0xE8); + Write32(u32(distance)); +} + +FixupBranch XEmitter::CALL() +{ + FixupBranch branch; + branch.type = FixupBranch::Type::Branch32Bit; + branch.ptr = code + 5; + Write8(0xE8); + Write32(0); + return branch; +} + +FixupBranch XEmitter::J(bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 5 : 2); + if (!force5bytes) + { + // 8 bits will do + Write8(0xEB); + Write8(0); + } + else + { + Write8(0xE9); + Write32(0); + } + return branch; +} + +FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 
6 : 2); + if (!force5bytes) + { + // 8 bits will do + Write8(0x70 + conditionCode); + Write8(0); + } + else + { + Write8(0x0F); + Write8(0x80 + conditionCode); + Write32(0); + } + return branch; +} + +void XEmitter::J_CC(CCFlags conditionCode, const u8* addr) +{ + u64 fn = (u64)addr; + s64 distance = (s64)(fn - ((u64)code + 2)); + if (distance < -0x80 || distance >= 0x80) + { + distance = (s64)(fn - ((u64)code + 6)); + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0x0F); + Write8(0x80 + conditionCode); + Write32((u32)(s32)distance); + } + else + { + Write8(0x70 + conditionCode); + Write8((u8)(s8)distance); + } +} + +void XEmitter::SetJumpTarget(const FixupBranch& branch) +{ + if (branch.type == FixupBranch::Type::Branch8Bit) + { + s64 distance = (s64)(code - branch.ptr); + if (!(distance >= -0x80 && distance < 0x80)) + { + printf("miauz\n"); + } + ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + branch.ptr[-1] = (u8)(s8)distance; + } + else if (branch.type == FixupBranch::Type::Branch32Bit) + { + s64 distance = (s64)(code - branch.ptr); + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + + s32 valid_distance = static_cast<s32>(distance); + std::memcpy(&branch.ptr[-4], &valid_distance, sizeof(s32)); + } +} + +// Single byte opcodes +// There is no PUSHAD/POPAD in 64-bit mode. +void XEmitter::INT3() +{ + Write8(0xCC); +} +void XEmitter::RET() +{ + Write8(0xC3); +} +void XEmitter::RET_FAST() +{ + Write8(0xF3); + Write8(0xC3); +} // two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to + // a ret + +// The first sign of decadence: optimized NOPs. +void XEmitter::NOP(size_t size) +{ + DEBUG_ASSERT((int)size > 0); + while (true) + { + switch (size) + { + case 0: + return; + case 1: + Write8(0x90); + return; + case 2: + Write8(0x66); + Write8(0x90); + return; + case 3: + Write8(0x0F); + Write8(0x1F); + Write8(0x00); + return; + case 4: + Write8(0x0F); + Write8(0x1F); + Write8(0x40); + Write8(0x00); + return; + case 5: + Write8(0x0F); + Write8(0x1F); + Write8(0x44); + Write8(0x00); + Write8(0x00); + return; + case 6: + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x44); + Write8(0x00); + Write8(0x00); + return; + case 7: + Write8(0x0F); + Write8(0x1F); + Write8(0x80); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + case 8: + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + case 9: + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + case 10: + Write8(0x66); + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + default: + // Even though x86 instructions are allowed to be up to 15 bytes long, + // AMD advises against using NOPs longer than 11 bytes because they + // carry a performance penalty on CPUs older than AMD family 16h. 
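+ // So anything longer than 10 bytes is emitted as a chain of these
+ // 11-byte NOPs (66 66 66 0F 1F 84 00 00 00 00 00), with the remainder
+ // handled by one of the shorter forms above, e.g. NOP(25) -> 11+11+3.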
+ Write8(0x66); + Write8(0x66); + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + size -= 11; + continue; + } + } +} + +void XEmitter::PAUSE() +{ + Write8(0xF3); + NOP(); +} // use in tight spinloops for energy saving on some CPU +void XEmitter::CLC() +{ + CheckFlags(); + Write8(0xF8); +} // clear carry +void XEmitter::CMC() +{ + CheckFlags(); + Write8(0xF5); +} // flip carry +void XEmitter::STC() +{ + CheckFlags(); + Write8(0xF9); +} // set carry + +// TODO: xchg ah, al ??? +void XEmitter::XCHG_AHAL() +{ + Write8(0x86); + Write8(0xe0); + // alt. 86 c4 +} + +// These two can not be executed on early Intel 64-bit CPU:s, only on AMD! +void XEmitter::LAHF() +{ + Write8(0x9F); +} +void XEmitter::SAHF() +{ + CheckFlags(); + Write8(0x9E); +} + +void XEmitter::PUSHF() +{ + Write8(0x9C); +} +void XEmitter::POPF() +{ + CheckFlags(); + Write8(0x9D); +} + +void XEmitter::LFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xE8); +} +void XEmitter::MFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xF0); +} +void XEmitter::SFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xF8); +} + +void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte + ((int)reg & 7)); +} + +void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte1); + Write8(byte2 + ((int)reg & 7)); +} + +void XEmitter::CWD(int bits) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, 0); + Write8(0x99); +} + +void XEmitter::CBW(int bits) +{ + if (bits == 8) + Write8(0x66); + Rex(bits == 32, 0, 0, 0); + Write8(0x98); +} + +// Simple opcodes + +// push/pop do not need wide to be 64-bit +void XEmitter::PUSH(X64Reg reg) +{ + WriteSimple1Byte(32, 0x50, reg); +} +void XEmitter::POP(X64Reg reg) +{ + WriteSimple1Byte(32, 0x58, reg); +} + +void XEmitter::PUSH(int bits, const OpArg& reg) +{ + if (reg.IsSimpleReg()) + PUSH(reg.GetSimpleReg()); + else if (reg.IsImm()) + { + switch (reg.GetImmBits()) + { + case 8: + Write8(0x6A); + Write8((u8)(s8)reg.offset); + break; + case 16: + Write8(0x66); + Write8(0x68); + Write16((u16)(s16)(s32)reg.offset); + break; + case 32: + Write8(0x68); + Write32((u32)reg.offset); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "PUSH - Bad imm bits"); + break; + } + } + else + { + if (bits == 16) + Write8(0x66); + reg.WriteREX(this, bits, bits); + Write8(0xFF); + reg.WriteRest(this, 0, (X64Reg)6); + } +} + +void XEmitter::POP(int /*bits*/, const OpArg& reg) +{ + if (reg.IsSimpleReg()) + POP(reg.GetSimpleReg()); + else + ASSERT_MSG(DYNA_REC, 0, "POP - Unsupported encoding"); +} + +void XEmitter::BSWAP(int bits, X64Reg reg) +{ + if (bits >= 32) + { + WriteSimple2Byte(bits, 0x0F, 0xC8, reg); + } + else if (bits == 16) + { + ROL(16, R(reg), Imm8(8)); + } + else if (bits == 8) + { + // Do nothing - can't bswap a single byte... + } + else + { + ASSERT_MSG(DYNA_REC, 0, "BSWAP - Wrong number of bits"); + } +} + +// Undefined opcode - reserved +// If we ever need a way to always cause a non-breakpoint hard exception... 
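+// UD2 is the two-byte sequence 0F 0B; it raises #UD instead of the
+// breakpoint exception INT3 uses, so a debugger won't silently swallow it.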
+void XEmitter::UD2() +{ + Write8(0x0F); + Write8(0x0B); +} + +void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) +{ + ASSERT_MSG(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument"); + arg.operandReg = (u8)level; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0x18); + arg.WriteRest(this); +} + +void XEmitter::SETcc(CCFlags flag, OpArg dest) +{ + ASSERT_MSG(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument"); + dest.operandReg = 0; + dest.WriteREX(this, 0, 8); + Write8(0x0F); + Write8(0x90 + (u8)flag); + dest.WriteRest(this); +} + +void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument"); + ASSERT_MSG(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported"); + if (bits == 16) + Write8(0x66); + src.operandReg = dest; + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0x40 + (u8)flag); + src.WriteRest(this); +} + +void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument"); + CheckFlags(); + src.operandReg = ext; + if (bits == 16) + Write8(0x66); + src.WriteREX(this, bits, bits, 0); + if (bits == 8) + { + Write8(0xF6); + } + else + { + Write8(0xF7); + } + src.WriteRest(this); +} + +void XEmitter::MUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 4); +} +void XEmitter::DIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 6); +} +void XEmitter::IMUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 5); +} +void XEmitter::IDIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 7); +} +void XEmitter::NEG(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 3); +} +void XEmitter::NOT(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 2); +} + +void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument"); + CheckFlags(); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); + if (rep) + Write8(0xF3); + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(byte2); + src.WriteRest(this); +} + +void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src) +{ + if (bits <= 16) + ASSERT_MSG(DYNA_REC, 0, "MOVNTI - bits<=16"); + WriteBitSearchType(bits, src, dest, 0xC3); +} + +void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBC); +} // Bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBD); +} // Top bit to bottom bit + +void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBC, true); +} +void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bLZCNT) + PanicAlert("Trying to use LZCNT on a system that doesn't support it. 
Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBD, true); +} + +void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + src.WriteREX(this, dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xBE); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xBF); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x63); + } + else + { + Crash(); + } + src.WriteRest(this); +} + +void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + // the 32bit result is automatically zero extended to 64bit + src.WriteREX(this, dbits == 64 ? 32 : dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xB6); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xB7); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x8B); + } + else + { + ASSERT_MSG(DYNA_REC, 0, "MOVZX - Invalid size"); + } + src.WriteRest(this); +} + +void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg) +{ + ASSERT_MSG(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it."); + if (bits == 8) + { + MOV(8, op & 1 ? arg : R(reg), op & 1 ? R(reg) : arg); + return; + } + if (bits == 16) + Write8(0x66); + ASSERT_MSG(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!"); + arg.WriteREX(this, bits, bits, reg); + Write8(0x0F); + Write8(0x38); + Write8(op); + arg.WriteRest(this, 0, reg); +} +void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src) +{ + WriteMOVBE(bits, 0xF0, dest, src); +} +void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src) +{ + WriteMOVBE(bits, 0xF1, src, dest); +} + +void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info) +{ + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + + switch (size) + { + case 8: + if (sign_extend) + MOVSX(32, 8, dst, src); + else + MOVZX(32, 8, dst, src); + break; + case 16: + MOVZX(32, 16, dst, src); + if (sign_extend) + { + BSWAP(32, dst); + SAR(32, R(dst), Imm8(16)); + } + else + { + ROL(16, R(dst), Imm8(8)); + } + break; + case 32: + case 64: + if (cpu_info.bMOVBE) + { + MOVBE(size, dst, src); + } + else + { + MOV(size, R(dst), src); + BSWAP(size, dst); + } + break; + } +} + +void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info) +{ + if (cpu_info.bMOVBE) + { + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + MOVBE(size, dst, src); + } + else + { + BSWAP(size, src); + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = true; + info->nonAtomicSwapStoreSrc = src; + } + MOV(size, dst, R(src)); + } +} + +void XEmitter::LEA(int bits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "LEA - Imm argument"); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); // TODO: performance warning + src.WriteREX(this, bits, bits); + Write8(0x8D); + src.WriteRest(this, 0, INVALID_REG, bits == 64); +} + +// shift can be either imm8 or cl +void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext) +{ + CheckFlags(); + bool writeImm = false; + if 
(dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteShift - can't shift imms"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "WriteShift - illegal argument"); + } + dest.operandReg = ext; + if (bits == 16) + Write8(0x66); + dest.WriteREX(this, bits, bits, 0); + if (shift.GetImmBits() == 8) + { + // ok an imm + u8 imm = (u8)shift.offset; + if (imm == 1) + { + Write8(bits == 8 ? 0xD0 : 0xD1); + } + else + { + writeImm = true; + Write8(bits == 8 ? 0xC0 : 0xC1); + } + } + else + { + Write8(bits == 8 ? 0xD2 : 0xD3); + } + dest.WriteRest(this, writeImm ? 1 : 0); + if (writeImm) + Write8((u8)shift.offset); +} + +// large rotates and shift are slower on Intel than AMD +// Intel likes to rotate by 1, and the op is smaller too +void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 0); +} +void XEmitter::ROR_(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 1); +} +void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 2); +} +void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 3); +} +void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 4); +} +void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 5); +} +void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 7); +} + +// index can be either imm8 or register, don't use memory destination because it's slow +void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - can't test imms"); + } + if ((index.IsImm() && index.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - illegal argument"); + } + if (bits == 16) + Write8(0x66); + if (index.IsImm()) + { + dest.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0xBA); + dest.WriteRest(this, 1, (X64Reg)ext); + Write8((u8)index.offset); + } + else + { + X64Reg operand = index.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + Write8(0x0F); + Write8(0x83 + 8 * ext); + dest.WriteRest(this, 1, operand); + } +} + +void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 4); +} +void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 5); +} +void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 6); +} +void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 7); +} + +// shift can be either imm8 or cl +void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); + Write8(0xAC); + 
dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); + Write8(0xAD); + dest.WriteRest(this, 0, operand); + } +} + +void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); + Write8(0xA4); + dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); + Write8(0xA5); + dest.WriteRest(this, 0, operand); + } +} + +void OpArg::WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg _operandReg, int bits) +{ + if (bits == 16) + emit->Write8(0x66); + + this->operandReg = (u8)_operandReg; + WriteREX(emit, bits, bits); + emit->Write8(op); + WriteRest(emit); +} + +// operand can either be immediate or register +void OpArg::WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, + int bits) const +{ + X64Reg _operandReg; + if (IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order"); + } + + if (bits == 16) + emit->Write8(0x66); + + int immToWrite = 0; + const NormalOpDef& op_def = normalops[static_cast<int>(op)]; + + if (operand.IsImm()) + { + WriteREX(emit, bits, bits); + + if (!toRM) + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)"); + } + + if (operand.scale == SCALE_IMM8 && bits == 8) + { + // op al, imm8 + if (!scale && offsetOrBaseReg == AL && op_def.eaximm8 != 0xCC) + { + emit->Write8(op_def.eaximm8); + emit->Write8((u8)operand.offset); + return; + } + // mov reg, imm8 + if (!scale && op == NormalOp::MOV) + { + emit->Write8(0xB0 + (offsetOrBaseReg & 7)); + emit->Write8((u8)operand.offset); + return; + } + // op r/m8, imm8 + emit->Write8(op_def.imm8); + immToWrite = 8; + } + else if ((operand.scale == SCALE_IMM16 && bits == 16) || + (operand.scale == SCALE_IMM32 && bits == 32) || + (operand.scale == SCALE_IMM32 && bits == 64)) + { + // Try to save immediate size if we can, but first check to see + // if the instruction supports simm8. + // op r/m, imm8 + if (op_def.simm8 != 0xCC && + ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) || + (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset))) + { + emit->Write8(op_def.simm8); + immToWrite = 8; + } + else + { + // mov reg, imm + if (!scale && op == NormalOp::MOV && bits != 64) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op eax, imm + if (!scale && offsetOrBaseReg == EAX && op_def.eaximm32 != 0xCC) + { + emit->Write8(op_def.eaximm32); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op r/m, imm + emit->Write8(op_def.imm32); + immToWrite = bits == 16 ? 
16 : 32; + } + } + else if ((operand.scale == SCALE_IMM8 && bits == 16) || + (operand.scale == SCALE_IMM8 && bits == 32) || + (operand.scale == SCALE_IMM8 && bits == 64)) + { + // op r/m, imm8 + emit->Write8(op_def.simm8); + immToWrite = 8; + } + else if (operand.scale == SCALE_IMM64 && bits == 64) + { + if (scale) + { + ASSERT_MSG(DYNA_REC, 0, + "WriteNormalOp - MOV with 64-bit imm requires register destination"); + } + // mov reg64, imm64 + else if (op == NormalOp::MOV) + { + // movabs reg64, imm64 (10 bytes) + if (static_cast<s64>(operand.offset) != static_cast<s32>(operand.offset)) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + emit->Write64(operand.offset); + return; + } + // mov reg64, simm32 (7 bytes) + emit->Write8(op_def.imm32); + immToWrite = 32; + } + else + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm"); + } + } + else + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits); + } + + // pass extension in REG of ModRM + _operandReg = static_cast<X64Reg>(op_def.ext); + } + else + { + _operandReg = (X64Reg)operand.offsetOrBaseReg; + WriteREX(emit, bits, bits, _operandReg); + // op r/m, reg + if (toRM) + { + emit->Write8(bits == 8 ? op_def.toRm8 : op_def.toRm32); + } + // op reg, r/m + else + { + emit->Write8(bits == 8 ? op_def.fromRm8 : op_def.fromRm32); + } + } + WriteRest(emit, immToWrite >> 3, _operandReg); + switch (immToWrite) + { + case 0: + break; + case 8: + emit->Write8((u8)operand.offset); + break; + case 16: + emit->Write16((u16)operand.offset); + break; + case 32: + emit->Write32((u32)operand.offset); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case"); + } +} + +void XEmitter::WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2) +{ + if (a1.IsImm()) + { + // Booh! 
Can't write to an imm + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm"); + return; + } + if (a2.IsImm()) + { + a1.WriteNormalOp(this, true, op, a2, bits); + } + else + { + if (a1.IsSimpleReg()) + { + a2.WriteNormalOp(this, false, op, a1, bits); + } + else + { + ASSERT_MSG(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(), + "WriteNormalOp - a1 and a2 cannot both be memory"); + a1.WriteNormalOp(this, true, op, a2, bits); + } + } +} + +void XEmitter::ADD(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::ADD, a1, a2); +} +void XEmitter::ADC(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::ADC, a1, a2); +} +void XEmitter::SUB(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::SUB, a1, a2); +} +void XEmitter::SBB(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::SBB, a1, a2); +} +void XEmitter::AND(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::AND, a1, a2); +} +void XEmitter::OR(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::OR, a1, a2); +} +void XEmitter::XOR(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::XOR, a1, a2); +} +void XEmitter::MOV(int bits, const OpArg& a1, const OpArg& a2) +{ + if (bits == 64 && a1.IsSimpleReg() && a2.scale == SCALE_IMM64 && + a2.offset == static_cast<u32>(a2.offset)) + { + WriteNormalOp(32, NormalOp::MOV, a1, a2.AsImm32()); + return; + } + if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) + { + ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code); + } + WriteNormalOp(bits, NormalOp::MOV, a1, a2); +} +void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::TEST, a1, a2); +} +void XEmitter::CMP(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::CMP, a1, a2); +} +void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2) +{ + WriteNormalOp(bits, NormalOp::XCHG, a1, a2); +} +void XEmitter::CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + if (a1.IsSimpleReg() && a2.IsZero()) // turn 'CMP reg, 0' into shorter 'TEST reg, reg' + { + WriteNormalOp(bits, NormalOp::TEST, a1, a1); + } + else + { + WriteNormalOp(bits, NormalOp::CMP, a1, a2); + } +} + +void XEmitter::MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2) +{ + // This stomps on flags, so ensure they aren't locked + DEBUG_ASSERT(!flags_locked); + + // Zero shortcuts (note that this can generate no code in the case where a1 == dest && a2 == zero + // or a2 == dest && a1 == zero) + if (a1.IsZero()) + { + if (!a2.IsSimpleReg() || a2.GetSimpleReg() != dest) + { + MOV(bits, R(dest), a2); + } + return; + } + if (a2.IsZero()) + { + if (!a1.IsSimpleReg() || a1.GetSimpleReg() != dest) + { + MOV(bits, R(dest), a1); + } + return; + } + + // If dest == a1 or dest == a2 we can simplify this + if (a1.IsSimpleReg() && a1.GetSimpleReg() == dest) + { + ADD(bits, R(dest), a2); + return; + } + + if (a2.IsSimpleReg() && a2.GetSimpleReg() == dest) + { + ADD(bits, R(dest), a1); + return; + } + + // TODO: 32-bit optimizations may apply to other bit sizes (confirm) + if (bits == 32) + { + if (a1.IsImm() && a2.IsImm()) + { + MOV(32, R(dest), Imm32(a1.Imm32() + a2.Imm32())); + return; + } + + if (a1.IsSimpleReg() && 
a2.IsSimpleReg()) + { + LEA(32, dest, MRegSum(a1.GetSimpleReg(), a2.GetSimpleReg())); + return; + } + + if (a1.IsSimpleReg() && a2.IsImm()) + { + LEA(32, dest, MDisp(a1.GetSimpleReg(), a2.Imm32())); + return; + } + + if (a1.IsImm() && a2.IsSimpleReg()) + { + LEA(32, dest, MDisp(a2.GetSimpleReg(), a1.Imm32())); + return; + } + } + + // Fallback + MOV(bits, R(dest), a1); + ADD(bits, R(dest), a2); +} + +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!"); + return; + } + + if (a1.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - second arg cannot be imm!"); + return; + } + + if (!a2.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - third arg must be imm!"); + return; + } + + if (bits == 16) + Write8(0x66); + a1.WriteREX(this, bits, bits, regOp); + + if (a2.GetImmBits() == 8 || (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) || + (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset)) + { + Write8(0x6B); + a1.WriteRest(this, 1, regOp); + Write8((u8)a2.offset); + } + else + { + Write8(0x69); + if (a2.GetImmBits() == 16 && bits == 16) + { + a1.WriteRest(this, 2, regOp); + Write16((u16)a2.offset); + } + else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) + { + a1.WriteRest(this, 4, regOp); + Write32((u32)a2.offset); + } + else + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - unhandled case!"); + } + } +} + +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!"); + return; + } + + if (a.IsImm()) + { + IMUL(bits, regOp, R(regOp), a); + return; + } + + if (bits == 16) + Write8(0x66); + a.WriteREX(this, bits, bits, regOp); + Write8(0x0F); + Write8(0xAF); + a.WriteRest(this, 0, regOp); +} + +void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + if (opPrefix) + Write8(opPrefix); + arg.operandReg = regOp; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + if (op > 0xFF) + Write8((op >> 8) & 0xFF); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes); +} + +static int GetVEXmmmmm(u16 op) +{ + // Currently, only 0x38 and 0x3A are used as secondary escape byte. + if ((op >> 8) == 0x3A) + return 3; + else if ((op >> 8) == 0x38) + return 2; + else + return 1; +} + +static int GetVEXpp(u8 opPrefix) +{ + if (opPrefix == 0x66) + return 1; + else if (opPrefix == 0xF3) + return 2; + else if (opPrefix == 0xF2) + return 3; + else + return 0; +} + +void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W, int extrabytes) +{ + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); + // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here + arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes, regOp1); +} + +void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, 1); + Write8((u8)regOp3 << 4); +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W, int extrabytes) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. 
Bad programmer."); + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer."); + WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W); +} + +void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W) +{ + if (!cpu_info.bFMA) + PanicAlert("Trying to use FMA3 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W); +} + +void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W) +{ + if (!cpu_info.bFMA4) + PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W); +} + +void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (arg.IsImm()) + PanicAlert("BMI1/2 instructions don't support immediate operands."); + if (size != 32 && size != 64) + PanicAlert("BMI1/2 instructions only support 32-bit and 64-bit modes!"); + int W = size == 64; + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bBMI2) + PanicAlert("Trying to use BMI2 on a system that doesn't support it. 
Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6E, dest, arg, 0); +} +void XEmitter::MOVD_xmm(const OpArg& arg, X64Reg src) +{ + WriteSSEOp(0x66, 0x7E, src, arg, 0); +} + +void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) +{ + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = dest; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x6E); + arg.WriteRest(this, 0); +} + +void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) +{ + if (src > 7 || arg.IsSimpleReg()) + { + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = src; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x7E); + arg.WriteRest(this, 0); + } + else + { + arg.operandReg = src; + arg.WriteREX(this, 0, 0); + Write8(0x66); + Write8(0x0f); + Write8(0xD6); + arg.WriteRest(this, 0); + } +} + +void XEmitter::WriteMXCSR(OpArg arg, int ext) +{ + if (arg.IsImm() || arg.IsSimpleReg()) + ASSERT_MSG(DYNA_REC, 0, "MXCSR - invalid operand"); + + arg.operandReg = ext; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0xAE); + arg.WriteRest(this); +} + +void XEmitter::STMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 3); +} +void XEmitter::LDMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 2); +} + +void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg); +} +void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVNTP, regOp, arg); +} +void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTP, regOp, arg); +} + +void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseADD, regOp, arg); +} +void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseADD, regOp, arg); +} +void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSUB, regOp, arg); +} +void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSUB, regOp, arg); +} +void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::MULSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMUL, regOp, arg); +} +void XEmitter::MULSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMUL, regOp, arg); +} +void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseDIV, regOp, arg); +} +void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseDIV, regOp, arg); +} +void XEmitter::MINSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMIN, regOp, arg); +} +void XEmitter::MINSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMIN, regOp, arg); +} +void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMAX, regOp, arg); +} +void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMAX, regOp, arg); +} +void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSQRT, regOp, arg); +} +void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSQRT, regOp, arg); +} +void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRCP, regOp, arg); 
+} +void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRSQRT, regOp, arg); +} + +void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseADD, regOp, arg); +} +void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseADD, regOp, arg); +} +void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSUB, regOp, arg); +} +void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSUB, regOp, arg); +} +void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x00, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x66, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseAND, regOp, arg); +} +void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseAND, regOp, arg); +} +void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseANDN, regOp, arg); +} +void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseANDN, regOp, arg); +} +void XEmitter::ORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseOR, regOp, arg); +} +void XEmitter::ORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseOR, regOp, arg); +} +void XEmitter::XORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseXOR, regOp, arg); +} +void XEmitter::XORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseXOR, regOp, arg); +} +void XEmitter::MULPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMUL, regOp, arg); +} +void XEmitter::MULPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMUL, regOp, arg); +} +void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseDIV, regOp, arg); +} +void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseDIV, regOp, arg); +} +void XEmitter::MINPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMIN, regOp, arg); +} +void XEmitter::MINPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMIN, regOp, arg); +} +void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMAX, regOp, arg); +} +void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMAX, regOp, arg); +} +void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSQRT, regOp, arg); +} +void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSQRT, regOp, arg); +} +void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRCP, regOp, arg); +} +void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRSQRT, regOp, arg); +} +void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x00, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} + +void XEmitter::COMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseCOMIS, regOp, arg); +} // weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseCOMIS, regOp, arg); +} // ordered +void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseUCOMIS, regOp, arg); +} // unordered +void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseUCOMIS, regOp, arg); +} + +void 
XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg); +} +void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg); +} + +void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg); +} +void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg); +} + +void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); +} +void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); +} + +void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); +} +void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); +} + +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2)); +} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2)); +} + +void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5A, regOp, arg); +} +void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5A, regOp, arg); +} + +void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x5A, regOp, arg); +} +void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5A, regOp, arg); +} +void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2D, regOp, arg); +} +void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2D, regOp, arg); +} +void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2A, regOp, arg); +} +void 
XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2A, regOp, arg); +} + +void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0xE6, regOp, arg); +} +void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5B, regOp, arg); +} +void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0xE6, regOp, arg); +} +void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5B, regOp, arg); +} + +void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2C, regOp, arg); +} +void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2C, regOp, arg); +} +void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5B, regOp, arg); +} +void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE6, regOp, arg); +} + +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) +{ + WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src)); +} + +void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x50, dest, arg); +} +void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x50, dest, arg); +} + +void XEmitter::LDDQU(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseLDDQU, dest, arg); +} // For integer data only + +void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x14, dest, arg); +} +void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x15, dest, arg); +} +void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x14, dest, arg); +} +void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x15, dest, arg); +} + +// Pretty much every x86 CPU nowadays supports SSE3, +// but the SSE2 fallbacks are easy. 
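+// Note: the non-SSE3 fallbacks below are not bit-exact. UNPCKLPS/UNPCKHPS of
+// a register with itself duplicates adjacent pairs ({a0,a0,a1,a1} and
+// {a2,a2,a3,a3}) rather than the even/odd lanes MOVSLDUP/MOVSHDUP pick
+// ({a0,a0,a2,a2} and {a1,a1,a3,a3}); only the MOVDDUP fallback is exact.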
+void XEmitter::MOVSLDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF3, 0x12, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVAPD(regOp, arg); + UNPCKLPS(regOp, R(regOp)); + } +} +void XEmitter::MOVSHDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF3, 0x16, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVAPD(regOp, arg); + UNPCKHPS(regOp, R(regOp)); + } +} +void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF2, 0x12, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVSD(regOp, arg); + UNPCKLPD(regOp, R(regOp)); + } +} + +// There are a few more left + +// Also some integer instructions are missing +void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6B, dest, arg); +} +void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x63, dest, arg); +} +void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x67, dest, arg); +} + +void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x60, dest, arg); +} +void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x61, dest, arg); +} +void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x62, dest, arg); +} +void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6C, dest, arg); +} + +void XEmitter::PSRLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xd3, reg, arg); +} + +void XEmitter::PSRLDQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLDQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); + Write8(shift); +} + +// WARNING not REX compatible +void XEmitter::PSRAW(X64Reg reg, int shift) +{ + if (reg > 7) + PanicAlert("The PSRAW-emitter does not support regs above 7"); + Write8(0x66); + Write8(0x0f); + Write8(0x71); + Write8(0xE0 | reg); + Write8(shift); +} + +// WARNING not REX compatible +void XEmitter::PSRAD(X64Reg reg, int shift) +{ + if (reg > 7) + PanicAlert("The PSRAD-emitter does not support regs above 7"); + Write8(0x66); + Write8(0x0f); + Write8(0x72); + Write8(0xE0 | reg); + Write8(shift); +} + +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bSSSE3) + PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bSSE4_1) + PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. 
Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg) +{ + WriteSSSE3Op(0x66, 0x3800, dest, arg); +} +void XEmitter::PTEST(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3817, dest, arg); +} +void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x382b, dest, arg); +} + +void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3820, dest, arg); +} +void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3821, dest, arg); +} +void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3822, dest, arg); +} +void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3823, dest, arg); +} +void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3824, dest, arg); +} +void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3825, dest, arg); +} +void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3830, dest, arg); +} +void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3831, dest, arg); +} +void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3832, dest, arg); +} +void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3833, dest, arg); +} +void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3834, dest, arg); +} +void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3835, dest, arg); +} + +void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3810, dest, arg); +} +void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3814, dest, arg); +} +void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3815, dest, arg); +} +void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); + Write8(blend); +} +void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); + Write8(blend); +} + +void XEmitter::PAND(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDB, dest, arg); +} +void XEmitter::PANDN(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDF, dest, arg); +} +void XEmitter::PXOR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEF, dest, arg); +} +void XEmitter::POR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEB, dest, arg); +} + +void XEmitter::PADDB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFC, dest, arg); +} +void XEmitter::PADDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFD, dest, arg); +} +void XEmitter::PADDD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFE, dest, arg); +} +void XEmitter::PADDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD4, dest, arg); +} + +void XEmitter::PADDSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEC, dest, arg); +} +void XEmitter::PADDSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xED, dest, arg); +} +void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDC, dest, arg); +} +void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDD, dest, arg); +} + +void XEmitter::PSUBB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF8, dest, arg); +} +void XEmitter::PSUBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF9, dest, arg); +} +void 
XEmitter::PSUBD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFA, dest, arg); +} +void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFB, dest, arg); +} + +void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE8, dest, arg); +} +void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE9, dest, arg); +} +void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD8, dest, arg); +} +void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD9, dest, arg); +} + +void XEmitter::PAVGB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE0, dest, arg); +} +void XEmitter::PAVGW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE3, dest, arg); +} + +void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x74, dest, arg); +} +void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x75, dest, arg); +} +void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x76, dest, arg); +} + +void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x64, dest, arg); +} +void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x65, dest, arg); +} +void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x66, dest, arg); +} + +void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC5, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC4, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSE41Op(0x66, 0x3A22, dest, arg); + Write8(subreg); +} + +void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF5, dest, arg); +} +void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF6, dest, arg); +} + +void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEE, dest, arg); +} +void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDE, dest, arg); +} +void XEmitter::PMINSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEA, dest, arg); +} +void XEmitter::PMINUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDA, dest, arg); +} + +void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD7, dest, arg); +} +void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF2, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF3, 0x70, regOp, arg, 1); + Write8(shuffle); +} + +// VEX +void XEmitter::VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + 
WriteAVXOp(0x00, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg); +} +void XEmitter::VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare) +{ + WriteAVXOp(0x66, sseCMP, regOp1, regOp2, arg, 0, 1); + Write8(compare); +} +void XEmitter::VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x00, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg); +} +void XEmitter::VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3) +{ + WriteAVXOp4(0x66, 0x3A4B, regOp1, regOp2, arg, regOp3); +} +void XEmitter::VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0C, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} +void XEmitter::VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0D, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); +} +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); +} +void 
XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); +} +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); +} + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); +} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); +} +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); +} +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); +} + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg 
regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1); +} + +#define FMA4(name, op) \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1); \ + } \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0); \ + } + +FMA4(VFMADDSUBPS, 0x5C) +FMA4(VFMADDSUBPD, 0x5D) +FMA4(VFMSUBADDPS, 0x5E) +FMA4(VFMSUBADDPD, 0x5F) +FMA4(VFMADDPS, 0x68) +FMA4(VFMADDPD, 0x69) +FMA4(VFMADDSS, 0x6A) +FMA4(VFMADDSD, 0x6B) +FMA4(VFMSUBPS, 0x6C) +FMA4(VFMSUBPD, 0x6D) +FMA4(VFMSUBSS, 0x6E) +FMA4(VFMSUBSD, 0x6F) +FMA4(VFNMADDPS, 0x78) +FMA4(VFNMADDPD, 0x79) +FMA4(VFNMADDSS, 0x7A) +FMA4(VFNMADDSD, 0x7B) +FMA4(VFNMSUBPS, 0x7C) +FMA4(VFNMSUBPD, 0x7D) +FMA4(VFNMSUBSS, 0x7E) +FMA4(VFNMSUBSD, 0x7F) +#undef FMA4 + +void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate) +{ + WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); + Write8(rotate); +} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg); +} +void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + CheckFlags(); + WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg); +} +void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg); +} +void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, 
(X64Reg)0x3, regOp, arg); +} +void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg); +} + +// Prefixes + +void XEmitter::LOCK() +{ + Write8(0xF0); +} +void XEmitter::REP() +{ + Write8(0xF3); +} +void XEmitter::REPNE() +{ + Write8(0xF2); +} +void XEmitter::FSOverride() +{ + Write8(0x64); +} +void XEmitter::GSOverride() +{ + Write8(0x65); +} + +void XEmitter::FWAIT() +{ + Write8(0x9B); +} + +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg) +{ + int mf = 0; + ASSERT_MSG(DYNA_REC, !(bits == 80 && op_80b == FloatOp::Invalid), + "WriteFloatLoadStore: 80 bits not supported for this instruction"); + switch (bits) + { + case 32: + mf = 0; + break; + case 64: + mf = 4; + break; + case 80: + mf = 2; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); + } + Write8(0xd9 | mf); + // x87 instructions use the reg field of the ModR/M byte as opcode: + if (bits == 80) + op = op_80b; + arg.WriteRest(this, 0, static_cast<X64Reg>(op)); +} + +void XEmitter::FLD(int bits, const OpArg& src) +{ + WriteFloatLoadStore(bits, FloatOp::LD, FloatOp::LD80, src); +} +void XEmitter::FST(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::ST, FloatOp::Invalid, dest); +} +void XEmitter::FSTP(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::STP, FloatOp::STP80, dest); +} +void XEmitter::FNSTSW_AX() +{ + Write8(0xDF); + Write8(0xE0); +} + +void XEmitter::RDTSC() +{ + Write8(0x0F); + Write8(0x31); +} +} diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h new file mode 100644 index 0000000..869acb6 --- /dev/null +++ b/src/dolphin/x64Emitter.h @@ -0,0 +1,1169 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!! 
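The emitter implementation above is one half of the pair; the header that follows declares the public XEmitter interface. As a rough illustration of how the pieces fit together, here is a minimal, hypothetical sketch of driving the emitter to generate and run a trivial function. It assumes a POSIX mmap for the executable buffer and uses only calls declared in the header below (the XEmitter(u8*) constructor, GetCodePtr, MOV, R, Imm32, RET); melonDS's actual JIT allocates and protects its code cache differently, and W^X platforms need extra handling.

#include <sys/mman.h>
#include <cstdio>
#include "x64Emitter.h"

int main()
{
    // Executable scratch buffer (sketch only; error handling and W^X omitted).
    u8* buf = (u8*)mmap(nullptr, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    Gen::XEmitter emit(buf);
    const u8* fn = emit.GetCodePtr();

    emit.MOV(32, Gen::R(Gen::EAX), Gen::Imm32(42));  // return value in EAX
    emit.RET();

    auto f = (int (*)())fn;
    printf("%d\n", f());  // prints 42
    return 0;
}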
+ +#pragma once + +#include <cstddef> +#include <cstring> +#include <functional> +#include <tuple> +#include <type_traits> + +#include "Compat.h" +#include "BitSet.h" +#include "../types.h" +#include "x64ABI.h" + +namespace Gen +{ +enum CCFlags +{ + CC_O = 0, + CC_NO = 1, + CC_B = 2, + CC_C = 2, + CC_NAE = 2, + CC_NB = 3, + CC_NC = 3, + CC_AE = 3, + CC_Z = 4, + CC_E = 4, + CC_NZ = 5, + CC_NE = 5, + CC_BE = 6, + CC_NA = 6, + CC_NBE = 7, + CC_A = 7, + CC_S = 8, + CC_NS = 9, + CC_P = 0xA, + CC_PE = 0xA, + CC_NP = 0xB, + CC_PO = 0xB, + CC_L = 0xC, + CC_NGE = 0xC, + CC_NL = 0xD, + CC_GE = 0xD, + CC_LE = 0xE, + CC_NG = 0xE, + CC_NLE = 0xF, + CC_G = 0xF +}; + +enum +{ + NUMGPRs = 16, + NUMXMMs = 16, +}; + +enum +{ + SCALE_NONE = 0, + SCALE_1 = 1, + SCALE_2 = 2, + SCALE_4 = 4, + SCALE_8 = 8, + SCALE_ATREG = 16, + // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG + SCALE_NOBASE_2 = 34, + SCALE_NOBASE_4 = 36, + SCALE_NOBASE_8 = 40, + SCALE_RIP = 0xFF, + SCALE_IMM8 = 0xF0, + SCALE_IMM16 = 0xF1, + SCALE_IMM32 = 0xF2, + SCALE_IMM64 = 0xF3, +}; + +enum SSECompare +{ + CMP_EQ = 0, + CMP_LT = 1, + CMP_LE = 2, + CMP_UNORD = 3, + CMP_NEQ = 4, + CMP_NLT = 5, + CMP_NLE = 6, + CMP_ORD = 7, +}; + +class XEmitter; +enum class FloatOp; +enum class NormalOp; + +// Information about a generated MOV op +struct MovInfo final +{ + u8* address; + bool nonAtomicSwapStore; + // valid iff nonAtomicSwapStore is true + X64Reg nonAtomicSwapStoreSrc; +}; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ + // For accessing offset and operandReg. + // This also allows us to keep the op writing functions private. + friend class XEmitter; + + // dummy op arg, used for storage + constexpr OpArg() = default; + constexpr OpArg(u64 offset_, int scale_, X64Reg rm_reg = RAX, X64Reg scaled_reg = RAX) + : scale{static_cast<u8>(scale_)}, offsetOrBaseReg{static_cast<u16>(rm_reg)}, + indexReg{static_cast<u16>(scaled_reg)}, offset{offset_} + { + } + constexpr bool operator==(const OpArg& b) const + { + // TODO: Use std::tie here once Dolphin requires C++17. (We can't do it immediately, + // (because we still support some older versions of GCC where std::tie is not constexpr.) 
+ return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && + indexReg == b.indexReg && offset == b.offset; + } + constexpr bool operator!=(const OpArg& b) const { return !operator==(b); } + u64 Imm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (u64)offset; + } + u32 Imm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (u32)offset; + } + u16 Imm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (u16)offset; + } + u8 Imm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (u8)offset; + } + + s64 SImm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (s64)offset; + } + s32 SImm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (s32)offset; + } + s16 SImm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (s16)offset; + } + s8 SImm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (s8)offset; + } + + OpArg AsImm64() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u64)offset, SCALE_IMM64); + } + OpArg AsImm32() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u32)offset, SCALE_IMM32); + } + OpArg AsImm16() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u16)offset, SCALE_IMM16); + } + OpArg AsImm8() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u8)offset, SCALE_IMM8); + } + + constexpr bool IsImm() const + { + return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || + scale == SCALE_IMM64; + } + constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; } + constexpr bool IsSimpleReg(X64Reg reg) const { return IsSimpleReg() && GetSimpleReg() == reg; } + constexpr bool IsZero() const { return IsImm() && offset == 0; } + constexpr int GetImmBits() const + { + switch (scale) + { + case SCALE_IMM8: + return 8; + case SCALE_IMM16: + return 16; + case SCALE_IMM32: + return 32; + case SCALE_IMM64: + return 64; + default: + return -1; + } + } + + constexpr X64Reg GetSimpleReg() const + { + if (scale == SCALE_NONE) + return static_cast<X64Reg>(offsetOrBaseReg); + + return INVALID_REG; + } + + void AddMemOffset(int val) + { + DEBUG_ASSERT_MSG(DYNA_REC, scale == SCALE_RIP || (scale <= SCALE_ATREG && scale > SCALE_NONE), + "Tried to increment an OpArg which doesn't have an offset"); + offset += val; + } + +private: + void WriteREX(XEmitter* emit, int opBits, int bits, int customOp = -1) const; + void WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W = 0) const; + void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG, + bool warn_64bit_offset = true) const; + void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits); + void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const; + + u8 scale = 0; + u16 offsetOrBaseReg = 0; + u16 indexReg = 0; + u64 offset = 0; // Also used to store immediates. 
+ u16 operandReg = 0; +}; + +template <typename T> +inline OpArg M(const T* ptr) +{ + return OpArg((u64)(const void*)ptr, (int)SCALE_RIP); +} +constexpr OpArg R(X64Reg value) +{ + return OpArg(0, SCALE_NONE, value); +} +constexpr OpArg MatR(X64Reg value) +{ + return OpArg(0, SCALE_ATREG, value); +} + +constexpr OpArg MDisp(X64Reg value, int offset) +{ + return OpArg(static_cast<u32>(offset), SCALE_ATREG, value); +} + +constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ + return OpArg(offset, scale, base, scaled); +} + +constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) +{ + if (scale == SCALE_1) + return OpArg(offset, SCALE_ATREG, scaled); + + return OpArg(offset, scale | 0x20, RAX, scaled); +} + +constexpr OpArg MRegSum(X64Reg base, X64Reg offset) +{ + return MComplex(base, offset, 1, 0); +} + +constexpr OpArg Imm8(u8 imm) +{ + return OpArg(imm, SCALE_IMM8); +} +constexpr OpArg Imm16(u16 imm) +{ + return OpArg(imm, SCALE_IMM16); +} // rarely used +constexpr OpArg Imm32(u32 imm) +{ + return OpArg(imm, SCALE_IMM32); +} +constexpr OpArg Imm64(u64 imm) +{ + return OpArg(imm, SCALE_IMM64); +} +inline OpArg ImmPtr(const void* imm) +{ + return Imm64(reinterpret_cast<u64>(imm)); +} + +inline u32 PtrOffset(const void* ptr, const void* base = nullptr) +{ + s64 distance = (s64)ptr - (s64)base; + if (distance >= 0x80000000LL || distance < -0x80000000LL) + { + ASSERT_MSG(DYNA_REC, 0, "pointer offset out of range"); + return 0; + } + + return (u32)distance; +} + +// usage: int a[]; ARRAY_OFFSET(a,10) +#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0])) +// usage: struct {int e;} s; STRUCT_OFFSET(s,e) +#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str))) + +struct FixupBranch +{ + enum class Type + { + Branch8Bit, + Branch32Bit + }; + + u8* ptr; + Type type; +}; + +class XEmitter +{ + friend struct OpArg; // for Write8 etc +private: + u8* code = nullptr; + bool flags_locked = false; + + void CheckFlags(); + + void Rex(int w, int r, int x, int b); + void WriteModRM(int mod, int reg, int rm); + void WriteSIB(int scale, int index, int base); + void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); + void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); + void WriteMulDivType(int bits, OpArg src, int ext); + void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); + void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext); + void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext); + void WriteMXCSR(OpArg arg, int ext); + void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void 
WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteMOVBE(int bits, u8 op, X64Reg regOp, const OpArg& arg); + void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); + void WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2); + + void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + +protected: + void Write8(u8 value); + void Write16(u16 value); + void Write32(u32 value); + void Write64(u64 value); + +public: + XEmitter() = default; + explicit XEmitter(u8* code_ptr) : code{code_ptr} {} + virtual ~XEmitter() = default; + void SetCodePtr(u8* ptr); + void ReserveCodeSpace(int bytes); + u8* AlignCodeTo(size_t alignment); + u8* AlignCode4(); + u8* AlignCode16(); + u8* AlignCodePage(); + const u8* GetCodePtr() const; + u8* GetWritableCodePtr(); + + void LockFlags() { flags_locked = true; } + void UnlockFlags() { flags_locked = false; } + // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU + // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other + // string instr., + // INC and DEC are slow on Intel Core, but not on AMD. They create a + // false flag dependency because they only update a subset of the flags. + // XCHG is SLOW and should be avoided. + + // Debug breakpoint + void INT3(); + + // Do nothing + void NOP(size_t count = 1); + + // Save energy in wait-loops on P4 only. Probably not too useful. + void PAUSE(); + + // Flag control + void STC(); + void CLC(); + void CMC(); + + // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and + // AMD! + void LAHF(); // 3 cycle vector path + void SAHF(); // direct path fast + + // Stack control + void PUSH(X64Reg reg); + void POP(X64Reg reg); + void PUSH(int bits, const OpArg& reg); + void POP(int bits, const OpArg& reg); + void PUSHF(); + void POPF(); + + // Flow control + void RET(); + void RET_FAST(); + void UD2(); + FixupBranch J(bool force5bytes = false); + + void JMP(const u8* addr, bool force5Bytes = false); + void JMPptr(const OpArg& arg); + void JMPself(); // infinite loop! +#ifdef CALL +#undef CALL +#endif + void CALL(const void* fnptr); + FixupBranch CALL(); + void CALLptr(OpArg arg); + + FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); + void J_CC(CCFlags conditionCode, const u8* addr); + + void SetJumpTarget(const FixupBranch& branch); + + void SETcc(CCFlags flag, OpArg dest); + // Note: CMOV brings small if any benefit on current CPUs. 
+ void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + + // Fences + void LFENCE(); + void MFENCE(); + void SFENCE(); + + // Bit scan + void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit + void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit + + // Cache control + enum PrefetchLevel + { + PF_NTA, // Non-temporal (data used once and only once) + PF_T0, // All cache levels + PF_T1, // Levels 2+ (aliased to T0 on AMD) + PF_T2, // Levels 3+ (aliased to T0 on AMD) + }; + void PREFETCH(PrefetchLevel level, OpArg arg); + void MOVNTI(int bits, const OpArg& dest, X64Reg src); + void MOVNTDQ(const OpArg& arg, X64Reg regOp); + void MOVNTPS(const OpArg& arg, X64Reg regOp); + void MOVNTPD(const OpArg& arg, X64Reg regOp); + + // Multiplication / division + void MUL(int bits, const OpArg& src); // UNSIGNED + void IMUL(int bits, const OpArg& src); // SIGNED + void IMUL(int bits, X64Reg regOp, const OpArg& src); + void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm); + void DIV(int bits, const OpArg& src); + void IDIV(int bits, const OpArg& src); + + // Shift + void ROL(int bits, const OpArg& dest, const OpArg& shift); + void ROR_(int bits, const OpArg& dest, const OpArg& shift); + void RCL(int bits, const OpArg& dest, const OpArg& shift); + void RCR(int bits, const OpArg& dest, const OpArg& shift); + void SHL(int bits, const OpArg& dest, const OpArg& shift); + void SHR(int bits, const OpArg& dest, const OpArg& shift); + void SAR(int bits, const OpArg& dest, const OpArg& shift); + + // Bit Test + void BT(int bits, const OpArg& dest, const OpArg& index); + void BTS(int bits, const OpArg& dest, const OpArg& index); + void BTR(int bits, const OpArg& dest, const OpArg& index); + void BTC(int bits, const OpArg& dest, const OpArg& index); + + // Double-Precision Shift + void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + + // Extend EAX into EDX in various ways + void CWD(int bits = 16); + inline void CDQ() { CWD(32); } + inline void CQO() { CWD(64); } + void CBW(int bits = 8); + inline void CWDE() { CBW(16); } + inline void CDQE() { CBW(32); } + // Load effective address + void LEA(int bits, X64Reg dest, OpArg src); + + // Integer arithmetic + void NEG(int bits, const OpArg& src); + void ADD(int bits, const OpArg& a1, const OpArg& a2); + void ADC(int bits, const OpArg& a1, const OpArg& a2); + void SUB(int bits, const OpArg& a1, const OpArg& a2); + void SBB(int bits, const OpArg& a1, const OpArg& a2); + void AND(int bits, const OpArg& a1, const OpArg& a2); + void CMP(int bits, const OpArg& a1, const OpArg& a2); + + // Bit operations + void NOT(int bits, const OpArg& src); + void OR(int bits, const OpArg& a1, const OpArg& a2); + void XOR(int bits, const OpArg& a1, const OpArg& a2); + void MOV(int bits, const OpArg& a1, const OpArg& a2); + void TEST(int bits, const OpArg& a1, const OpArg& a2); + + void CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2); + void MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2); + + // Are these useful at all? Consider removing. + void XCHG(int bits, const OpArg& a1, const OpArg& a2); + void XCHG_AHAL(); + + // Byte swapping (32 and 64-bit only). 
+ void BSWAP(int bits, X64Reg reg); + + // Sign/zero extension + void MOVSX(int dbits, int sbits, X64Reg dest, + OpArg src); // automatically uses MOVSXD if necessary + void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); + + // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE. + void MOVBE(int bits, X64Reg dest, const OpArg& src); + void MOVBE(int bits, const OpArg& dest, X64Reg src); + void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false, + MovInfo* info = nullptr); + void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr); + + // Available only on AMD >= Phenom or Intel >= Haswell + void LZCNT(int bits, X64Reg dest, const OpArg& src); + // Note: this one is actually part of BMI1 + void TZCNT(int bits, X64Reg dest, const OpArg& src); + + // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) + void STMXCSR(const OpArg& memloc); + void LDMXCSR(const OpArg& memloc); + + // Prefixes + void LOCK(); + void REP(); + void REPNE(); + void FSOverride(); + void GSOverride(); + + // x87 + enum x87StatusWordBits + { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + + void FLD(int bits, const OpArg& src); + void FST(int bits, const OpArg& dest); + void FSTP(int bits, const OpArg& dest); + void FNSTSW_AX(); + void FWAIT(); + + // SSE/SSE2: Floating point arithmetic + void ADDSS(X64Reg regOp, const OpArg& arg); + void ADDSD(X64Reg regOp, const OpArg& arg); + void SUBSS(X64Reg regOp, const OpArg& arg); + void SUBSD(X64Reg regOp, const OpArg& arg); + void MULSS(X64Reg regOp, const OpArg& arg); + void MULSD(X64Reg regOp, const OpArg& arg); + void DIVSS(X64Reg regOp, const OpArg& arg); + void DIVSD(X64Reg regOp, const OpArg& arg); + void MINSS(X64Reg regOp, const OpArg& arg); + void MINSD(X64Reg regOp, const OpArg& arg); + void MAXSS(X64Reg regOp, const OpArg& arg); + void MAXSD(X64Reg regOp, const OpArg& arg); + void SQRTSS(X64Reg regOp, const OpArg& arg); + void SQRTSD(X64Reg regOp, const OpArg& arg); + void RCPSS(X64Reg regOp, const OpArg& arg); + void RSQRTSS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point bitwise (yes) + void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare); + + // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) + void ADDPS(X64Reg regOp, const OpArg& arg); + void ADDPD(X64Reg regOp, const OpArg& arg); + void SUBPS(X64Reg regOp, const OpArg& arg); + void SUBPD(X64Reg regOp, const OpArg& arg); + void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare); + void MULPS(X64Reg regOp, const OpArg& arg); + void MULPD(X64Reg regOp, const OpArg& arg); + void DIVPS(X64Reg regOp, const OpArg& arg); + void DIVPD(X64Reg regOp, const OpArg& arg); + void MINPS(X64Reg regOp, const OpArg& arg); + void MINPD(X64Reg regOp, const OpArg& arg); + void MAXPS(X64Reg regOp, const OpArg& arg); + void MAXPD(X64Reg regOp, const OpArg& arg); + void SQRTPS(X64Reg regOp, const OpArg& arg); + void SQRTPD(X64Reg regOp, const OpArg& arg); + void RCPPS(X64Reg regOp, const OpArg& arg); + void RSQRTPS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point 
packed bitwise (x4 for float, x2 for double) + void ANDPS(X64Reg regOp, const OpArg& arg); + void ANDPD(X64Reg regOp, const OpArg& arg); + void ANDNPS(X64Reg regOp, const OpArg& arg); + void ANDNPD(X64Reg regOp, const OpArg& arg); + void ORPS(X64Reg regOp, const OpArg& arg); + void ORPD(X64Reg regOp, const OpArg& arg); + void XORPS(X64Reg regOp, const OpArg& arg); + void XORPD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. + void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle); + void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle); + + // SSE3 + void MOVSLDUP(X64Reg regOp, const OpArg& arg); + void MOVSHDUP(X64Reg regOp, const OpArg& arg); + void MOVDDUP(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Useful alternative to shuffle in some cases. + void UNPCKLPS(X64Reg dest, const OpArg& src); + void UNPCKHPS(X64Reg dest, const OpArg& src); + void UNPCKLPD(X64Reg dest, const OpArg& src); + void UNPCKHPD(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Compares. + void COMISS(X64Reg regOp, const OpArg& arg); + void COMISD(X64Reg regOp, const OpArg& arg); + void UCOMISS(X64Reg regOp, const OpArg& arg); + void UCOMISD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Moves. Use the right data type for your data, in most cases. + void MOVAPS(X64Reg regOp, const OpArg& arg); + void MOVAPD(X64Reg regOp, const OpArg& arg); + void MOVAPS(const OpArg& arg, X64Reg regOp); + void MOVAPD(const OpArg& arg, X64Reg regOp); + + void MOVUPS(X64Reg regOp, const OpArg& arg); + void MOVUPD(X64Reg regOp, const OpArg& arg); + void MOVUPS(const OpArg& arg, X64Reg regOp); + void MOVUPD(const OpArg& arg, X64Reg regOp); + + void MOVDQA(X64Reg regOp, const OpArg& arg); + void MOVDQA(const OpArg& arg, X64Reg regOp); + void MOVDQU(X64Reg regOp, const OpArg& arg); + void MOVDQU(const OpArg& arg, X64Reg regOp); + + void MOVSS(X64Reg regOp, const OpArg& arg); + void MOVSD(X64Reg regOp, const OpArg& arg); + void MOVSS(const OpArg& arg, X64Reg regOp); + void MOVSD(const OpArg& arg, X64Reg regOp); + + void MOVLPS(X64Reg regOp, const OpArg& arg); + void MOVLPD(X64Reg regOp, const OpArg& arg); + void MOVLPS(const OpArg& arg, X64Reg regOp); + void MOVLPD(const OpArg& arg, X64Reg regOp); + + void MOVHPS(X64Reg regOp, const OpArg& arg); + void MOVHPD(X64Reg regOp, const OpArg& arg); + void MOVHPS(const OpArg& arg, X64Reg regOp); + void MOVHPD(const OpArg& arg, X64Reg regOp); + + void MOVHLPS(X64Reg regOp1, X64Reg regOp2); + void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + + // Be careful when using these overloads for reg <--> xmm moves. + // The one you cast to OpArg with R(reg) is the x86 reg, the other + // one is the xmm reg. + // ie: "MOVD_xmm(eax, R(xmm1))" generates incorrect code (movd xmm0, rcx) + // use "MOVD_xmm(R(eax), xmm1)" instead. + void MOVD_xmm(X64Reg dest, const OpArg& arg); + void MOVQ_xmm(X64Reg dest, OpArg arg); + void MOVD_xmm(const OpArg& arg, X64Reg src); + void MOVQ_xmm(OpArg arg, X64Reg src); + + // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in + // question. + void MOVMSKPS(X64Reg dest, const OpArg& arg); + void MOVMSKPD(X64Reg dest, const OpArg& arg); + + // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a + // weird one. + void MASKMOVDQU(X64Reg dest, X64Reg src); + void LDDQU(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Data type conversions. 
+ void CVTPS2PD(X64Reg dest, const OpArg& src); + void CVTPD2PS(X64Reg dest, const OpArg& src); + void CVTSS2SD(X64Reg dest, const OpArg& src); + void CVTSI2SS(X64Reg dest, const OpArg& src); + void CVTSD2SS(X64Reg dest, const OpArg& src); + void CVTSI2SD(X64Reg dest, const OpArg& src); + void CVTDQ2PD(X64Reg regOp, const OpArg& arg); + void CVTPD2DQ(X64Reg regOp, const OpArg& arg); + void CVTDQ2PS(X64Reg regOp, const OpArg& arg); + void CVTPS2DQ(X64Reg regOp, const OpArg& arg); + + void CVTTPS2DQ(X64Reg regOp, const OpArg& arg); + void CVTTPD2DQ(X64Reg regOp, const OpArg& arg); + + // Destinations are X64 regs (rax, rbx, ...) for these instructions. + void CVTSS2SI(X64Reg xregdest, const OpArg& src); + void CVTSD2SI(X64Reg xregdest, const OpArg& src); + void CVTTSS2SI(X64Reg xregdest, const OpArg& arg); + void CVTTSD2SI(X64Reg xregdest, const OpArg& arg); + + // SSE2: Packed integer instructions + void PACKSSDW(X64Reg dest, const OpArg& arg); + void PACKSSWB(X64Reg dest, const OpArg& arg); + void PACKUSDW(X64Reg dest, const OpArg& arg); + void PACKUSWB(X64Reg dest, const OpArg& arg); + + void PUNPCKLBW(X64Reg dest, const OpArg& arg); + void PUNPCKLWD(X64Reg dest, const OpArg& arg); + void PUNPCKLDQ(X64Reg dest, const OpArg& arg); + void PUNPCKLQDQ(X64Reg dest, const OpArg& arg); + + void PTEST(X64Reg dest, const OpArg& arg); + void PAND(X64Reg dest, const OpArg& arg); + void PANDN(X64Reg dest, const OpArg& arg); + void PXOR(X64Reg dest, const OpArg& arg); + void POR(X64Reg dest, const OpArg& arg); + + void PADDB(X64Reg dest, const OpArg& arg); + void PADDW(X64Reg dest, const OpArg& arg); + void PADDD(X64Reg dest, const OpArg& arg); + void PADDQ(X64Reg dest, const OpArg& arg); + + void PADDSB(X64Reg dest, const OpArg& arg); + void PADDSW(X64Reg dest, const OpArg& arg); + void PADDUSB(X64Reg dest, const OpArg& arg); + void PADDUSW(X64Reg dest, const OpArg& arg); + + void PSUBB(X64Reg dest, const OpArg& arg); + void PSUBW(X64Reg dest, const OpArg& arg); + void PSUBD(X64Reg dest, const OpArg& arg); + void PSUBQ(X64Reg dest, const OpArg& arg); + + void PSUBSB(X64Reg dest, const OpArg& arg); + void PSUBSW(X64Reg dest, const OpArg& arg); + void PSUBUSB(X64Reg dest, const OpArg& arg); + void PSUBUSW(X64Reg dest, const OpArg& arg); + + void PAVGB(X64Reg dest, const OpArg& arg); + void PAVGW(X64Reg dest, const OpArg& arg); + + void PCMPEQB(X64Reg dest, const OpArg& arg); + void PCMPEQW(X64Reg dest, const OpArg& arg); + void PCMPEQD(X64Reg dest, const OpArg& arg); + + void PCMPGTB(X64Reg dest, const OpArg& arg); + void PCMPGTW(X64Reg dest, const OpArg& arg); + void PCMPGTD(X64Reg dest, const OpArg& arg); + + void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg); + + void PMADDWD(X64Reg dest, const OpArg& arg); + void PSADBW(X64Reg dest, const OpArg& arg); + + void PMAXSW(X64Reg dest, const OpArg& arg); + void PMAXUB(X64Reg dest, const OpArg& arg); + void PMINSW(X64Reg dest, const OpArg& arg); + void PMINUB(X64Reg dest, const OpArg& arg); + + void PMOVMSKB(X64Reg dest, const OpArg& arg); + void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFB(X64Reg dest, const OpArg& arg); + + void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); + + void PSRLW(X64Reg reg, int shift); + void PSRLD(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, const OpArg& arg); + void PSRLDQ(X64Reg reg, 
int shift); + + void PSLLW(X64Reg reg, int shift); + void PSLLD(X64Reg reg, int shift); + void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + + void PSRAW(X64Reg reg, int shift); + void PSRAD(X64Reg reg, int shift); + + // SSE4: data type conversions + void PMOVSXBW(X64Reg dest, const OpArg& arg); + void PMOVSXBD(X64Reg dest, const OpArg& arg); + void PMOVSXBQ(X64Reg dest, const OpArg& arg); + void PMOVSXWD(X64Reg dest, const OpArg& arg); + void PMOVSXWQ(X64Reg dest, const OpArg& arg); + void PMOVSXDQ(X64Reg dest, const OpArg& arg); + void PMOVZXBW(X64Reg dest, const OpArg& arg); + void PMOVZXBD(X64Reg dest, const OpArg& arg); + void PMOVZXBQ(X64Reg dest, const OpArg& arg); + void PMOVZXWD(X64Reg dest, const OpArg& arg); + void PMOVZXWQ(X64Reg dest, const OpArg& arg); + void PMOVZXDQ(X64Reg dest, const OpArg& arg); + + // SSE4: blend instructions + void PBLENDVB(X64Reg dest, const OpArg& arg); + void BLENDVPS(X64Reg dest, const OpArg& arg); + void BLENDVPD(X64Reg dest, const OpArg& arg); + void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); + void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + + // AVX + void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare); + void VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg mask); + void VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + void VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + + void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + // FMA3 + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PS(X64Reg 
regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + +#define FMA4(name) \ + void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \ + void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + + FMA4(VFMADDSUBPS) + FMA4(VFMADDSUBPD) + FMA4(VFMSUBADDPS) + FMA4(VFMSUBADDPD) + FMA4(VFMADDPS) + FMA4(VFMADDPD) + FMA4(VFMADDSS) + FMA4(VFMADDSD) + FMA4(VFMSUBPS) + FMA4(VFMSUBPD) + FMA4(VFMSUBSS) + FMA4(VFMSUBSD) + FMA4(VFNMADDPS) + FMA4(VFNMADDPD) + FMA4(VFNMADDSS) + FMA4(VFNMADDSD) + FMA4(VFNMSUBPS) + FMA4(VFNMSUBPD) + FMA4(VFNMSUBSS) + FMA4(VFNMSUBSD) +#undef FMA4 + + // VEX GPR instructions + void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate); + void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void BLSR(int bits, X64Reg regOp, const OpArg& arg); + void BLSMSK(int bits, X64Reg regOp, const OpArg& arg); + void BLSI(int bits, X64Reg regOp, const OpArg& arg); + void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void RDTSC(); + + // Utility functions + // The difference between this and CALL is that this aligns the stack + // where appropriate. 
+ template <typename FunctionPointer> + void ABI_CallFunction(FunctionPointer func) + { + static_assert(std::is_pointer<FunctionPointer>() && + std::is_function<std::remove_pointer_t<FunctionPointer>>(), + "Supplied type must be a function pointer."); + + const void* ptr = reinterpret_cast<const void*>(func); + const u64 address = reinterpret_cast<u64>(ptr); + const u64 distance = address - (reinterpret_cast<u64>(code) + 5); + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) + { + // Far call + MOV(64, R(RAX), Imm64(address)); + CALLptr(R(RAX)); + } + else + { + CALL(ptr); + } + } + + template <typename FunctionPointer> + void ABI_CallFunctionC16(FunctionPointer func, u16 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC16(FunctionPointer func, u32 param1, u16 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionC(FunctionPointer func, u32 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC(FunctionPointer func, u32 param1, u32 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCP(FunctionPointer func, u32 param1, const void* param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCC(FunctionPointer func, u32 param1, u32 param2, u32 param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCP(FunctionPointer func, u32 param1, u32 param2, const void* param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(64, R(ABI_PARAM3), Imm64(reinterpret_cast<u64>(param3))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCCP(FunctionPointer func, u32 param1, u32 param2, u32 param3, + const void* param4) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + MOV(64, R(ABI_PARAM4), Imm64(reinterpret_cast<u64>(param4))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPC(FunctionPointer func, const void* param1, u32 param2) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPPC(FunctionPointer func, const void* param1, const void* param2, u32 param3) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + // Pass a register as a parameter. + template <typename FunctionPointer> + void ABI_CallFunctionR(FunctionPointer func, X64Reg reg1) + { + if (reg1 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(reg1)); + ABI_CallFunction(func); + } + + // Pass two registers as parameters. 
+ template <typename FunctionPointer> + void ABI_CallFunctionRR(FunctionPointer func, X64Reg reg1, X64Reg reg2) + { + MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionAC(int bits, FunctionPointer func, const Gen::OpArg& arg1, u32 param2) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionA(int bits, FunctionPointer func, const Gen::OpArg& arg1) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + ABI_CallFunction(func); + } + + // Helper method for ABI functions related to calling functions. May be used by itself as well. + void MOVTwo(int bits, X64Reg dst1, X64Reg src1, s32 offset, X64Reg dst2, X64Reg src2); + + // Saves/restores the registers and adjusts the stack to be aligned as + // required by the ABI, where the previous alignment was as specified. + // Push returns the size of the shadow space, i.e. the offset of the frame. + size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template <typename T, typename... Args> + static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args) + { + return (*f)(args...); + } + + template <typename T, typename... Args> + void ABI_CallLambdaC(const std::function<T(Args...)>* f, u32 p1) + { + auto trampoline = &XEmitter::CallLambdaTrampoline<T, Args...>; + ABI_CallFunctionPC(trampoline, reinterpret_cast<const void*>(f), p1); + } +}; // class XEmitter + +} // namespace diff --git a/src/dolphin/x64Reg.h b/src/dolphin/x64Reg.h new file mode 100644 index 0000000..a92e024 --- /dev/null +++ b/src/dolphin/x64Reg.h @@ -0,0 +1,96 @@ +// Copyright 2016 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. 
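The ABI_CallFunction* templates above are the bridge between generated code and ordinary C functions: arguments are loaded into the ABI_PARAMn registers (defined in x64ABI.h, which is not part of this hunk), and the call itself is emitted as a rel32 CALL when the target lies within ±2 GiB of the code buffer, falling back to MOV RAX, imm64 plus CALLptr otherwise. A hedged sketch of how generated code might use one of these wrappers follows; the callee here is purely illustrative, not a melonDS function.

#include <cstdio>
#include "x64Emitter.h"

// Hypothetical C helper a generated block could call; not part of melonDS.
static void LogWrite(u32 addr, u32 val)
{
    printf("write %08X <- %08X\n", addr, val);
}

// Emits a call to LogWrite(0x02000000, 0xDEADBEEF) into an existing emitter.
// ABI_CallFunctionCC loads both immediates into the first two ABI parameter
// registers, then ABI_CallFunction picks the near or far call form.
static void EmitLoggedWrite(Gen::XEmitter& emit)
{
    emit.ABI_CallFunctionCC(&LogWrite, 0x02000000u, 0xDEADBEEFu);
}

In real JIT code such a call would normally be bracketed with ABI_PushRegistersAndAdjustStack / ABI_PopRegistersAndAdjustStack so that live caller-saved registers survive the call and the stack meets the ABI's alignment requirement.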
+ +#pragma once + +namespace Gen +{ +enum X64Reg +{ + EAX = 0, + EBX = 3, + ECX = 1, + EDX = 2, + ESI = 6, + EDI = 7, + EBP = 5, + ESP = 4, + + RAX = 0, + RBX = 3, + RCX = 1, + RDX = 2, + RSI = 6, + RDI = 7, + RBP = 5, + RSP = 4, + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, + + AL = 0, + BL = 3, + CL = 1, + DL = 2, + SIL = 6, + DIL = 7, + BPL = 5, + SPL = 4, + AH = 0x104, + BH = 0x107, + CH = 0x105, + DH = 0x106, + + AX = 0, + BX = 3, + CX = 1, + DX = 2, + SI = 6, + DI = 7, + BP = 5, + SP = 4, + + XMM0 = 0, + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7, + XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + + YMM0 = 0, + YMM1, + YMM2, + YMM3, + YMM4, + YMM5, + YMM6, + YMM7, + YMM8, + YMM9, + YMM10, + YMM11, + YMM12, + YMM13, + YMM14, + YMM15, + + INVALID_REG = 0xFFFFFFFF +}; + +} // namespace Gen diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 09faf4e..9ee7b9a 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -32,6 +32,7 @@ EmuSettingsDialog* EmuSettingsDialog::currentDlg = nullptr; extern char* EmuDirectory; +extern bool RunningSomething; EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::EmuSettingsDialog) @@ -53,6 +54,22 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->cbxConsoleType->setCurrentIndex(Config::ConsoleType); ui->chkDirectBoot->setChecked(Config::DirectBoot != 0); + +#ifdef JIT_ENABLED + ui->chkEnableJIT->setChecked(Config::JIT_Enable != 0); + ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); + ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); + ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); +#else + ui->chkEnableJIT->setDisabled(true); + ui->chkJITBranchOptimisations->setDisabled(true); + ui->chkJITLiteralOptimisations->setDisabled(true); + ui->chkJITFastMemory->setDisabled(true); + ui->spnJITMaximumBlockSize->setDisabled(true); +#endif + + on_chkEnableJIT_toggled(); } EmuSettingsDialog::~EmuSettingsDialog() @@ -102,29 +119,78 @@ void EmuSettingsDialog::verifyFirmware() } } -void EmuSettingsDialog::on_EmuSettingsDialog_accepted() +void EmuSettingsDialog::done(int r) { - verifyFirmware(); - - strncpy(Config::BIOS9Path, ui->txtBIOS9Path->text().toStdString().c_str(), 1023); Config::BIOS9Path[1023] = '\0'; - strncpy(Config::BIOS7Path, ui->txtBIOS7Path->text().toStdString().c_str(), 1023); Config::BIOS7Path[1023] = '\0'; - strncpy(Config::FirmwarePath, ui->txtFirmwarePath->text().toStdString().c_str(), 1023); Config::FirmwarePath[1023] = '\0'; - - strncpy(Config::DSiBIOS9Path, ui->txtDSiBIOS9Path->text().toStdString().c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; - strncpy(Config::DSiBIOS7Path, ui->txtDSiBIOS7Path->text().toStdString().c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; - strncpy(Config::DSiFirmwarePath, ui->txtDSiFirmwarePath->text().toStdString().c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; - strncpy(Config::DSiNANDPath, ui->txtDSiNANDPath->text().toStdString().c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; - - Config::ConsoleType = ui->cbxConsoleType->currentIndex(); - Config::DirectBoot = ui->chkDirectBoot->isChecked() ? 
1:0; - - Config::Save(); + if (r == QDialog::Accepted) + { + verifyFirmware(); + + int consoleType = ui->cbxConsoleType->currentIndex(); + int directBoot = ui->chkDirectBoot->isChecked() ? 1:0; + + int jitEnable = ui->chkEnableJIT->isChecked() ? 1:0; + int jitMaxBlockSize = ui->spnJITMaximumBlockSize->value(); + int jitBranchOptimisations = ui->chkJITBranchOptimisations->isChecked() ? 1:0; + int jitLiteralOptimisations = ui->chkJITLiteralOptimisations->isChecked() ? 1:0; + int jitFastMemory = ui->chkJITFastMemory->isChecked() ? 1:0; + + std::string bios9Path = ui->txtBIOS9Path->text().toStdString(); + std::string bios7Path = ui->txtBIOS7Path->text().toStdString(); + std::string firmwarePath = ui->txtFirmwarePath->text().toStdString(); + std::string dsiBios9Path = ui->txtDSiBIOS9Path->text().toStdString(); + std::string dsiBios7Path = ui->txtDSiBIOS7Path->text().toStdString(); + std::string dsiFirmwarePath = ui->txtDSiFirmwarePath->text().toStdString(); + std::string dsiNANDPath = ui->txtDSiNANDPath->text().toStdString(); + + if (consoleType != Config::ConsoleType + || directBoot != Config::DirectBoot +#ifdef JIT_ENABLED + || jitEnable != Config::JIT_Enable + || jitMaxBlockSize != Config::JIT_MaxBlockSize + || jitBranchOptimisations != Config::JIT_BranchOptimisations + || jitLiteralOptimisations != Config::JIT_LiteralOptimisations + || jitFastMemory != Config::JIT_FastMemory +#endif + || strcmp(Config::BIOS9Path, bios9Path.c_str()) != 0 + || strcmp(Config::BIOS7Path, bios7Path.c_str()) != 0 + || strcmp(Config::FirmwarePath, firmwarePath.c_str()) != 0 + || strcmp(Config::DSiBIOS9Path, dsiBios9Path.c_str()) != 0 + || strcmp(Config::DSiBIOS7Path, dsiBios7Path.c_str()) != 0 + || strcmp(Config::DSiFirmwarePath, dsiFirmwarePath.c_str()) != 0 + || strcmp(Config::DSiNANDPath, dsiNANDPath.c_str()) != 0) + { + if (RunningSomething + && QMessageBox::warning(this, "Reset necessary to apply changes", + "The emulation will be reset for the changes to take place", + QMessageBox::Yes, QMessageBox::Cancel) != QMessageBox::Yes) + return; + + strncpy(Config::BIOS9Path, bios9Path.c_str(), 1023); Config::BIOS9Path[1023] = '\0'; + strncpy(Config::BIOS7Path, bios7Path.c_str(), 1023); Config::BIOS7Path[1023] = '\0'; + strncpy(Config::FirmwarePath, firmwarePath.c_str(), 1023); Config::FirmwarePath[1023] = '\0'; + + strncpy(Config::DSiBIOS9Path, dsiBios9Path.c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; + strncpy(Config::DSiBIOS7Path, dsiBios7Path.c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; + strncpy(Config::DSiFirmwarePath, dsiFirmwarePath.c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; + strncpy(Config::DSiNANDPath, dsiNANDPath.c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; + + #ifdef JIT_ENABLED + Config::JIT_Enable = jitEnable; + Config::JIT_MaxBlockSize = jitMaxBlockSize; + Config::JIT_BranchOptimisations = jitBranchOptimisations; + Config::JIT_LiteralOptimisations = jitLiteralOptimisations; + Config::JIT_FastMemory = jitFastMemory; + #endif + + Config::ConsoleType = consoleType; + Config::DirectBoot = directBoot; + + Config::Save(); + } + } - closeDlg(); -} + QDialog::done(r); -void EmuSettingsDialog::on_EmuSettingsDialog_rejected() -{ closeDlg(); } @@ -211,3 +277,12 @@ void EmuSettingsDialog::on_btnDSiNANDBrowse_clicked() ui->txtDSiNANDPath->setText(file); } + +void EmuSettingsDialog::on_chkEnableJIT_toggled() +{ + bool disabled = !ui->chkEnableJIT->isChecked(); + ui->chkJITBranchOptimisations->setDisabled(disabled); + ui->chkJITLiteralOptimisations->setDisabled(disabled); + 
ui->chkJITFastMemory->setDisabled(disabled); + ui->spnJITMaximumBlockSize->setDisabled(disabled); +}
\ No newline at end of file diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.h b/src/frontend/qt_sdl/EmuSettingsDialog.h index f604ba5..268036c 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.h +++ b/src/frontend/qt_sdl/EmuSettingsDialog.h @@ -51,8 +51,7 @@ public: } private slots: - void on_EmuSettingsDialog_accepted(); - void on_EmuSettingsDialog_rejected(); + void done(int r); void on_btnBIOS9Browse_clicked(); void on_btnBIOS7Browse_clicked(); @@ -63,6 +62,8 @@ private slots: void on_btnDSiFirmwareBrowse_clicked(); void on_btnDSiNANDBrowse_clicked(); + void on_chkEnableJIT_toggled(); + private: void verifyFirmware(); diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.ui b/src/frontend/qt_sdl/EmuSettingsDialog.ui index 4894fa5..11d48cc 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.ui +++ b/src/frontend/qt_sdl/EmuSettingsDialog.ui @@ -6,8 +6,8 @@ <rect> <x>0</x> <y>0</y> - <width>490</width> - <height>392</height> + <width>514</width> + <height>359</height> </rect> </property> <property name="sizePolicy"> @@ -24,243 +24,336 @@ <enum>QLayout::SetFixedSize</enum> </property> <item> - <widget class="QGroupBox" name="groupBox"> - <property name="title"> - <string>DS mode</string> + <widget class="QTabWidget" name="tabWidget"> + <property name="currentIndex"> + <number>0</number> </property> - <layout class="QGridLayout" name="gridLayout_2"> - <item row="0" column="1"> - <widget class="QLineEdit" name="txtBIOS9Path"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="minimumSize"> - <size> - <width>290</width> - <height>0</height> - </size> - </property> - <property name="statusTip"> - <string/> - </property> - <property name="whatsThis"> - <string><html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="2" column="0"> - <widget class="QLabel" name="label_3"> - <property name="text"> - <string>DS firmware:</string> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QLabel" name="label_2"> - <property name="text"> - <string>DS ARM7 BIOS:</string> - </property> - </widget> - </item> - <item row="0" column="0"> - <widget class="QLabel" name="label"> - <property name="text"> - <string>DS ARM9 BIOS:</string> - </property> - </widget> - </item> - <item row="0" column="2"> - <widget class="QPushButton" name="btnBIOS9Browse"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Browse...</string> - </property> - <property name="autoDefault"> - <bool>true</bool> - </property> - </widget> - </item> - <item row="1" column="1"> - <widget class="QLineEdit" name="txtBIOS7Path"> - <property name="whatsThis"> - <string><html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="1" column="2"> - <widget class="QPushButton" name="btnBIOS7Browse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="2" column="1"> - <widget class="QLineEdit" name="txtFirmwarePath"> - <property name="whatsThis"> - <string><html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 KB: DS-mode firmware from a DSi or 3DS. 
Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html></string> - </property> - </widget> - </item> - <item row="2" column="2"> - <widget class="QPushButton" name="btnFirmwareBrowse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QGroupBox" name="groupBox_3"> - <property name="title"> - <string>DSi mode</string> - </property> - <layout class="QGridLayout" name="gridLayout_3"> - <item row="0" column="2"> - <widget class="QPushButton" name="btnDSiBIOS9Browse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="0" column="0"> - <widget class="QLabel" name="label_5"> - <property name="text"> - <string>DSi ARM9 BIOS:</string> - </property> - </widget> - </item> - <item row="2" column="2"> - <widget class="QPushButton" name="btnDSiFirmwareBrowse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="1" column="1"> - <widget class="QLineEdit" name="txtDSiBIOS7Path"> - <property name="whatsThis"> - <string><html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="2" column="1"> - <widget class="QLineEdit" name="txtDSiFirmwarePath"> - <property name="whatsThis"> - <string><html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QLabel" name="label_6"> - <property name="text"> - <string>DSi ARM7 BIOS:</string> - </property> - </widget> - </item> - <item row="2" column="0"> - <widget class="QLabel" name="label_7"> - <property name="text"> - <string>DSi firmware:</string> - </property> - </widget> - </item> - <item row="1" column="2"> - <widget class="QPushButton" name="btnDSiBIOS7Browse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QLineEdit" name="txtDSiBIOS9Path"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="whatsThis"> - <string><html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="3" column="0"> - <widget class="QLabel" name="label_8"> - <property name="text"> - <string>DSi NAND:</string> - </property> - </widget> - </item> - <item row="3" column="1"> - <widget class="QLineEdit" name="txtDSiNANDPath"> - <property name="whatsThis"> - <string><html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html></string> - </property> - </widget> - </item> - <item row="3" column="2"> - <widget class="QPushButton" name="btnDSiNANDBrowse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QGroupBox" name="groupBox_2"> - <property name="title"> - <string>General</string> - </property> - <layout class="QGridLayout" name="gridLayout"> - <item row="0" column="0"> - <widget class="QLabel" name="label_4"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Fixed"> - <horstretch>0</horstretch> - 
<verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Console type:</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QComboBox" name="cbxConsoleType"> - <property name="sizePolicy"> - <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="whatsThis"> - <string><html><head/><body><p>The type of console to emulate</p></body></html></string> - </property> - </widget> - </item> - <item row="1" column="0" colspan="2"> - <widget class="QCheckBox" name="chkDirectBoot"> - <property name="whatsThis"> - <string><html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html></string> - </property> - <property name="text"> - <string>Boot game directly</string> - </property> - </widget> - </item> - </layout> + <widget class="QWidget" name="tab"> + <attribute name="title"> + <string>General</string> + </attribute> + <layout class="QFormLayout" name="formLayout_4"> + <item row="1" column="1"> + <widget class="QComboBox" name="cbxConsoleType"> + <property name="sizePolicy"> + <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="whatsThis"> + <string><html><head/><body><p>The type of console to emulate</p></body></html></string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QCheckBox" name="chkDirectBoot"> + <property name="whatsThis"> + <string><html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html></string> + </property> + <property name="text"> + <string>Boot game directly</string> + </property> + </widget> + </item> + <item row="3" column="0"> + <spacer name="verticalSpacer_2"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>20</width> + <height>40</height> + </size> + </property> + </spacer> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_4"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="text"> + <string>Console type:</string> + </property> + </widget> + </item> + </layout> + </widget> + <widget class="QWidget" name="tab_2"> + <attribute name="title"> + <string>BIOS Files</string> + </attribute> + <layout class="QVBoxLayout" name="verticalLayout_2"> + <item> + <widget class="QGroupBox" name="groupBox"> + <property name="title"> + <string>DS mode</string> + </property> + <layout class="QGridLayout" name="gridLayout_2"> + <item row="2" column="0"> + <widget class="QLabel" name="label_3"> + <property name="text"> + <string>DS firmware:</string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QLineEdit" name="txtFirmwarePath"> + <property name="whatsThis"> + <string><html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 
KB: DS-mode firmware from a DSi or 3DS. Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html></string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QLineEdit" name="txtBIOS7Path"> + <property name="whatsThis"> + <string><html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="0" column="2"> + <widget class="QPushButton" name="btnBIOS9Browse"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="text"> + <string>Browse...</string> + </property> + <property name="autoDefault"> + <bool>true</bool> + </property> + </widget> + </item> + <item row="2" column="2"> + <widget class="QPushButton" name="btnFirmwareBrowse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_2"> + <property name="text"> + <string>DS ARM7 BIOS:</string> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="label"> + <property name="text"> + <string>DS ARM9 BIOS:</string> + </property> + </widget> + </item> + <item row="1" column="2"> + <widget class="QPushButton" name="btnBIOS7Browse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="0" column="1"> + <widget class="QLineEdit" name="txtBIOS9Path"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="minimumSize"> + <size> + <width>290</width> + <height>0</height> + </size> + </property> + <property name="statusTip"> + <string/> + </property> + <property name="whatsThis"> + <string><html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html></string> + </property> + </widget> + </item> + </layout> + </widget> + </item> + <item> + <widget class="QGroupBox" name="groupBox_3"> + <property name="title"> + <string>DSi mode</string> + </property> + <layout class="QGridLayout" name="gridLayout_3"> + <item row="0" column="2"> + <widget class="QPushButton" name="btnDSiBIOS9Browse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="label_5"> + <property name="text"> + <string>DSi ARM9 BIOS:</string> + </property> + </widget> + </item> + <item row="2" column="2"> + <widget class="QPushButton" name="btnDSiFirmwareBrowse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QLineEdit" name="txtDSiBIOS7Path"> + <property name="whatsThis"> + <string><html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QLineEdit" name="txtDSiFirmwarePath"> + <property name="whatsThis"> + <string><html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_6"> + <property name="text"> + <string>DSi ARM7 BIOS:</string> + </property> + </widget> + 
</item> + <item row="2" column="0"> + <widget class="QLabel" name="label_7"> + <property name="text"> + <string>DSi firmware:</string> + </property> + </widget> + </item> + <item row="1" column="2"> + <widget class="QPushButton" name="btnDSiBIOS7Browse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="0" column="1"> + <widget class="QLineEdit" name="txtDSiBIOS9Path"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="whatsThis"> + <string><html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="3" column="0"> + <widget class="QLabel" name="label_8"> + <property name="text"> + <string>DSi NAND:</string> + </property> + </widget> + </item> + <item row="3" column="1"> + <widget class="QLineEdit" name="txtDSiNANDPath"> + <property name="whatsThis"> + <string><html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html></string> + </property> + </widget> + </item> + <item row="3" column="2"> + <widget class="QPushButton" name="btnDSiNANDBrowse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + </layout> + </widget> + </item> + </layout> + </widget> + <widget class="QWidget" name="tab_3"> + <attribute name="title"> + <string>CPU Emulation</string> + </attribute> + <layout class="QFormLayout" name="formLayout_5"> + <item row="0" column="0"> + <widget class="QCheckBox" name="chkEnableJIT"> + <property name="text"> + <string>Enable JIT recompiler</string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_9"> + <property name="text"> + <string>Maximum JIT block size:</string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QSpinBox" name="spnJITMaximumBlockSize"> + <property name="minimum"> + <number>1</number> + </property> + <property name="maximum"> + <number>32</number> + </property> + <property name="value"> + <number>32</number> + </property> + </widget> + </item> + <item row="2" column="0"> + <widget class="QCheckBox" name="chkJITBranchOptimisations"> + <property name="text"> + <string>Branch Optimisations</string> + </property> + </widget> + </item> + <item row="3" column="0"> + <widget class="QCheckBox" name="chkJITLiteralOptimisations"> + <property name="text"> + <string>Literal Optimisations</string> + </property> + </widget> + </item> + <item row="4" column="0"> + <widget class="QCheckBox" name="chkJITFastMemory"> + <property name="text"> + <string>Fast Memory</string> + </property> + </widget> + </item> + <item row="5" column="0"> + <spacer name="verticalSpacer"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>20</width> + <height>40</height> + </size> + </property> + </spacer> + </item> + </layout> + </widget> </widget> </item> <item> @@ -275,6 +368,27 @@ </item> </layout> </widget> + <tabstops> + <tabstop>tabWidget</tabstop> + <tabstop>cbxConsoleType</tabstop> + <tabstop>chkDirectBoot</tabstop> + <tabstop>txtBIOS9Path</tabstop> + <tabstop>txtBIOS7Path</tabstop> + <tabstop>txtFirmwarePath</tabstop> + <tabstop>txtDSiBIOS9Path</tabstop> + <tabstop>txtDSiBIOS7Path</tabstop> + <tabstop>txtDSiFirmwarePath</tabstop> + 
<tabstop>txtDSiNANDPath</tabstop> + <tabstop>btnBIOS9Browse</tabstop> + <tabstop>btnBIOS7Browse</tabstop> + <tabstop>btnFirmwareBrowse</tabstop> + <tabstop>btnDSiBIOS9Browse</tabstop> + <tabstop>btnDSiBIOS7Browse</tabstop> + <tabstop>btnDSiFirmwareBrowse</tabstop> + <tabstop>btnDSiNANDBrowse</tabstop> + <tabstop>chkEnableJIT</tabstop> + <tabstop>spnJITMaximumBlockSize</tabstop> + </tabstops> <resources/> <connections> <connection> @@ -284,8 +398,8 @@ <slot>accept()</slot> <hints> <hint type="sourcelabel"> - <x>248</x> - <y>254</y> + <x>257</x> + <y>349</y> </hint> <hint type="destinationlabel"> <x>157</x> @@ -300,8 +414,8 @@ <slot>reject()</slot> <hints> <hint type="sourcelabel"> - <x>316</x> - <y>260</y> + <x>325</x> + <y>349</y> </hint> <hint type="destinationlabel"> <x>286</x> diff --git a/src/frontend/qt_sdl/PlatformConfig.cpp b/src/frontend/qt_sdl/PlatformConfig.cpp index 06128d7..bfb3f97 100644 --- a/src/frontend/qt_sdl/PlatformConfig.cpp +++ b/src/frontend/qt_sdl/PlatformConfig.cpp @@ -72,6 +72,7 @@ char MicWavPath[1024]; char LastROMFolder[1024]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp index fa542ad..4557d0e 100644 --- a/src/frontend/qt_sdl/main.cpp +++ b/src/frontend/qt_sdl/main.cpp @@ -1641,7 +1641,14 @@ void MainWindow::onStop() void MainWindow::onOpenEmuSettings() { - EmuSettingsDialog::openDlg(this); + EmuSettingsDialog* dlg = EmuSettingsDialog::openDlg(this); + connect(dlg, &EmuSettingsDialog::finished, this, &MainWindow::onEmuSettingsDialogFinished); +} + +void MainWindow::onEmuSettingsDialogFinished(int res) +{ + if (RunningSomething) + onReset(); } void MainWindow::onOpenInputConfig() diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h index 279aed8..eec2a48 100644 --- a/src/frontend/qt_sdl/main.h +++ b/src/frontend/qt_sdl/main.h @@ -199,6 +199,7 @@ private slots: void onStop(); void onOpenEmuSettings(); + void onEmuSettingsDialogFinished(int res); void onOpenInputConfig(); void onInputConfigFinished(int res); void onOpenVideoSettings(); diff --git a/src/version.h b/src/version.h index 6250601..9084606 100644 --- a/src/version.h +++ b/src/version.h @@ -19,7 +19,7 @@ #ifndef VERSION_H #define VERSION_H -#define MELONDS_VERSION "0.8.3" +#define MELONDS_VERSION "0.8.3-JIT" #define MELONDS_URL "http://melonds.kuribo64.net/" diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include <immintrin.h> +# elif defined(__SSE2__) +# include <emmintrin.h> +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include <arm_neon.h> +# undef inline +# endif +#elif defined(_MSC_VER) +# include <intrin.h> +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. 
+ * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
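+ * The default below can be overridden by pre-defining XXH_ACC_ALIGN before this
+ * file is included (note the #ifndef guard).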
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include <s390intrin.h> +# else +# include <altivec.h> +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. 
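   On targets without that instruction, the fallback below reproduces it with
   vec_perm and an explicit byte-reversal pattern, swapping the bytes of each
   64-bit lane.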
*/ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 
0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +/* + * Calculates a 32-bit to 64-bit long multiply. + * + * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) + * { + * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); + * } + */ +#if defined(_MSC_VER) && defined(_M_IX86) +# include <intrin.h> +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/* + * Calculates a 64->128-bit long multiply. + * + * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + +#else + /* + * Portable scalar method. 
Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/* + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * We don't need to (or want to) mix as much as XXH64. + * + * Short hashes are more evenly distributed, so it isn't necessary. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. 
+ * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + 
XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + __asm__ ("" : "+r" (seed64)); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
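   Each 16-byte chunk of input is mixed with 16 bytes of secret (and the seed)
   through a 64x64->128 multiply-fold (XXH3_mix16B above), and the running sum
   is avalanched at the end.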
*/ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); + + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. 
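+ *
+ * Schematically, per 64-bit accumulator lane (a scalar view of the SIMD
+ * variants below; the names here are illustrative only):
+ *
+ *   xxh_u64 data_key = input64 ^ secret64;
+ *   acc += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+ *   acc += input64;   // 64-bit mode; 128-bit mode adds the swapped lane instead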
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
+ * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); +} + +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_64bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->acc[0] = PRIME32_3; + statePtr->acc[1] = PRIME64_1; + statePtr->acc[2] = PRIME64_2; + statePtr->acc[3] = PRIME64_3; + statePtr->acc[4] = PRIME64_4; + statePtr->acc[5] = PRIME32_2; + statePtr->acc[6] = PRIME64_5; + statePtr->acc[7] = PRIME32_1; + statePtr->seed = seed; + XXH_ASSERT(secret != NULL); + statePtr->secret = secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if 
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
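As a quick usage sketch of the 64-bit API assembled above (the one-shot XXH3_64bits* dispatchers plus the streaming create/reset/update/digest cycle): the function names and return conventions below come straight from this file, while the seed value, the two-chunk split and the printf formatting are purely illustrative.

    /* minimal sketch; in this header revision XXH3 still sits behind the advanced/static section */
    #define XXH_STATIC_LINKING_ONLY
    #include "xxhash.h"
    #include <stdio.h>

    static void hash_both_ways(const void* data, size_t size)
    {
        /* One-shot: picks the 0-16 / 17-128 / 129-240 / long-input path internally. */
        XXH64_hash_t const one_shot = XXH3_64bits_withSeed(data, size, 1234);

        /* Streaming: the same input fed in two chunks through the state machine. */
        XXH3_state_t* const state = XXH3_createState();
        if (state == NULL) return;                    /* aligned allocation can fail */
        if (XXH3_64bits_reset_withSeed(state, 1234) == XXH_OK) {
            size_t const half = size / 2;
            XXH3_64bits_update(state, data, half);
            XXH3_64bits_update(state, (const char*)data + half, size - half);
            {   XXH64_hash_t const streamed = XXH3_64bits_digest(state);
                /* both paths are defined to produce the same value */
                printf("%016llx %016llx\n",
                       (unsigned long long)one_shot, (unsigned long long)streamed);
            }
        }
        XXH3_freeState(state);
    }
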
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
+ */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) +{ + return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, + const xxh_u8* secret, size_t secretSize) +{ + return XXH3_hashLong_128b_internal(input, len, secret, secretSize); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); +} + + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ + +/* + * All the functions are actually the same as for the 64-bit streaming variant. + * The only difference is the finalization routine. 
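Before the streaming section, a short sketch of how a caller consumes the 128-bit one-shot entry points defined just above; the function and field names are taken from this file, the buffer and the printed check are illustrative, and XXH128_isEqual() is only defined a little further below.

    #define XXH_STATIC_LINKING_ONLY
    #include "xxhash.h"
    #include <stdio.h>

    static void hash_128(const void* key, size_t len)
    {
        /* XXH128() is simply a forwarding wrapper around XXH3_128bits_withSeed(). */
        XXH128_hash_t const a = XXH128(key, len, 0);
        XXH128_hash_t const b = XXH3_128bits(key, len);   /* unseeded variant; same result when seed == 0 */

        /* The result is a struct of two 64-bit halves rather than one integer. */
        printf("%016llx%016llx\n",
               (unsigned long long)a.high64, (unsigned long long)a.low64);

        /* XXH128_isEqual() (defined below) compares the full 128-bit value. */
        printf("equal: %d\n", XXH128_isEqual(a, b));
    }
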
+ */ + +static void +XXH3_128bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits); +} + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_128bits); + XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + state->secret + state->secretLimit + STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* 128-bit utility functions */ + +#include <string.h> /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). 
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h new file mode 100644 index 0000000..67a5887 --- /dev/null +++ b/src/xxhash/xxhash.h @@ -0,0 +1,1965 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* TODO: update */ +/* Notice extracted from xxHash homepage: + +xxHash is an extremely fast hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +Note: SMHasher's CRC32 implementation is not the fastest one. +Other speed-oriented implementations can be faster, +especially in combination with PCLMUL instruction: +https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. 
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. 
+ * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
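+ *
+ * As an illustration (the `MYLIB_` prefix below is a placeholder chosen for
+ * this example, not part of the library): building with
+ * `-DXXH_NAMESPACE=MYLIB_` makes this header map XXH32 to MYLIB_XXH32 at
+ * preprocessing time, so the exported symbol is prefixed while calling code
+ * stays unchanged:
+ *
+ *   #include "xxhash.h"
+ *   XXH32_hash_t h = XXH32(buffer, size, 0);   // actually calls MYLIB_XXH32()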
+ */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 7 +#define XXH_VERSION_RELEASE 4 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Definitions +******************************/ +#include <stddef.h> /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint32_t XXH32_hash_t; +#else +# include <limits.h> +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# else +# if ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +# endif +#endif + +/*! + * XXH32(): + * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + * The memory between input & input+length must be valid (allocated and read-accessible). + * "seed" can be used to alter the result predictably. + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. + */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +/******* Streaming *******/ + +/* + * Streaming functions generate the xxHash value from an incrememtal input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. 
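+ *
+ * Putting these calls together, a typical streaming session looks roughly
+ * like the sketch below (XXH32 shown; `fill_buffer`, `buf` and `len` are
+ * placeholders for the caller's own input handling). Each step is described
+ * in the following paragraphs.
+ *
+ *   XXH32_state_t* const state = XXH32_createState();
+ *   XXH32_reset(state, 0);                          // seed = 0
+ *   while ((len = fill_buffer(buf, sizeof(buf))) > 0)
+ *       XXH32_update(state, buf, len);
+ *   XXH32_hash_t const hash = XXH32_digest(state);
+ *   XXH32_freeState(state);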
+ * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + */ + +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint64_t XXH64_hash_t; +#else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +#endif + +/*! + * XXH64(): + * Returns the 64-bit hash of sequence of length @length stored at memory + * address @input. + * @seed can be used to alter the result predictably. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. 
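+ *
+ * A one-shot call is a single line, e.g. (illustrative; `buf` and `bufSize`
+ * stand for the caller's own data):
+ *
+ *   XXH64_hash_t const h = XXH64(buf, bufSize, 0);   // seed = 0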
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ * + * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run + * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are + * explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The 128-bit version adds additional strength, but it is slightly slower. + * + * The XXH3 algorithm is still in development. + * The results it produces may still change in future versions. + * + * Results produced by v0.7.x are not comparable with results from v0.7.y. + * However, the API is completely stable, and it can safely be used for + * ephemeral data (local sessions). + * + * Avoid storing values in long-term storage until the algorithm is finalized. + * + * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if + * everything remains fine, its current format will be "frozen" and become the + * final one. + * + * After which, return values of XXH3 and XXH128 will no longer change in + * future versions. + * + * XXH3's return values will be officially finalized upon reaching v0.8.0. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) + +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) + +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +#endif + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional + * collision. + * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid trivial sequences, such as repeating sequences and especially '\0', + * as this can cancel out itself. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXH3_SECRET_SIZE_MIN 136 +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly based on the default + * secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. 
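+ *
+ * Illustrative call (the seed value is an arbitrary example):
+ *
+ *   XXH64_hash_t const h = XXH3_64bits_withSeed(buf, len, 0x1234);
+ *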
+ * Note: seed==0 produces the same results as XXH3_64bits(). + */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); + + +/* streaming 64-bit */ + +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ +# include <stdalign.h> +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +typedef struct XXH3_state_s XXH3_state_t; + +#define XXH3_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ +#define XXH3_INTERNALBUFFER_SIZE 256 +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /* used to store a custom secret generated from the seed. Makes state larger. + * Design might change */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + XXH32_hash_t bufferedSize; + XXH32_hash_t nbStripesPerBlock; + XXH32_hash_t nbStripesSoFar; + XXH32_hash_t secretLimit; + XXH32_hash_t reserved32; + XXH32_hash_t reserved32_2; + XXH64_hash_t totalLen; + XXH64_hash_t seed; + XXH64_hash_t reserved64; + /* note: there is some padding after due to alignment on 64 bytes */ + const unsigned char* secret; +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever possible. + */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + + +/* + * XXH3_64bits_reset(): + * Initialize with the default parameters. + * The result will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/* + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, and must outlive the hash streaming session, so + * be careful when using stack arrays. + * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`. 
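+ *
+ * A sketch that keeps the secret alive for the whole session (illustrative;
+ * `mySecret` is a placeholder assumed to be filled with random bytes
+ * elsewhere):
+ *
+ *   static unsigned char mySecret[XXH3_SECRET_SIZE_MIN];   // outlives the streaming session
+ *   XXH3_64bits_reset_withSecret(statePtr, mySecret, sizeof(mySecret));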
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
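+ *
+ * For example (illustrative; `hashes` and `count` are the caller's own data):
+ *
+ *   qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);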
+ * + * return: >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 + */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[16]; } XXH128_canonical_t; +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); +XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be found in xxhash.c. + * + * However, code inlining requires the implementation to be visible to the + * compiler, usually within the header. + * + * As a workaround, xxhash.c used to be included within xxhash.h. This caused + * some issues with some build systems, especially ones which treat .c files + * as source files. + * + * Therefore, the implementation is now directly integrated within xxhash.h. + * Another small advantage is that xxhash.c is no longer needed in /include. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ +/*! + * XXH_FORCE_MEMORY_ACCESS: + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow to select a different access method for improved + * performance. + * Method 0 (default): + * Use `memcpy()`. Safe and portable. + * Method 1: + * `__attribute__((packed))` statement. It depends on compiler extensions + * and is therefore not portable. + * This method is safe if your compiler supports it, and *generally* as + * fast or faster than `memcpy`. + * Method 2: + * Direct access via cast. This method doesn't depend on the compiler but + * violates the C standard. + * It can generate buggy code on targets which do not support unaligned + * memory accesses. + * But in some circumstances, it's the only known way to get the most + * performance (ie GCC + ARMv6) + * Method 3: + * Byteshift. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. + * See https://stackoverflow.com/a/32095106/646947 for details. 
+ * Prefer these methods in priority order (0 > 1 > 2 > 3) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*! + *XXH_ACCEPT_NULL_INPUT_POINTER: + * If the input pointer is NULL, xxHash's default behavior is to dereference it, + * triggering a segfault. + * When this macro is enabled, xxHash actively checks the input for a null pointer. + * If it is, the result for null input pointers is the same as a zero-length input. + */ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*! + * XXH_FORCE_ALIGN_CHECK: + * This is a minor performance trick, only useful with lots of very small keys. + * It means: check for aligned/unaligned input. + * The check costs one initial branch per hash; + * Set it to 0 when the input is guaranteed to be aligned or when alignment + * doesn't matter for performance. + * + * This option does not affect XXH3. + */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +/*! + * XXH_NO_INLINE_HINTS: + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ +#ifndef XXH_NO_INLINE_HINTS +# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ + || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +/*! + * XXH_REROLL: + * Whether to reroll XXH32_finalize, and XXH64_finalize, + * instead of using an unrolled jump table/if statement loop. + * + * This is automatically defined on -Os/-Oz on GCC and Clang. + */ +#ifndef XXH_REROLL +# if defined(__OPTIMIZE_SIZE__) +# define XXH_REROLL 1 +# else +# define XXH_REROLL 0 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! + * Modify the local functions below should you wish to use some other memory + * routines for malloc() and free() + */ +#include <stdlib.h> + +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free(void* p) { free(p); } + +/*! 
and for memcpy() */ +#include <string.h> +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#else +# if defined (__cplusplus) \ + || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define XXH_FORCE_INLINE static inline __attribute__((always_inline)) +# define XXH_NO_INLINE static __attribute__((noinline)) +# else +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +# endif +# else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + + +/* ************************************* +* Debug +***************************************/ +/* + * DEBUGLEVEL is expected to be defined externally, typically via the compiler's + * command line options. The value must be a number. + */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + +#if (DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# define XXH_ASSERT(c) ((void)0) +#endif + +/* note: use after variable declarations */ +#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0) + + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + + +/* *** Memory access *** */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianess *** */ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/*! + * XXH_CPU_LITTLE_ENDIAN: + * Defined to 1 if the target is little endian, or 0 if it is big endian. 
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ +static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ +static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ +static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ +static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ + +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= PRIME32_1; +#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * This inline assembly hack forces acc into a normal register. This is the + * only thing that prevents GCC and Clang from autovectorizing the XXH32 + * loop (pragmas and attributes don't work for some resason) without globally + * disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * How this hack works: + * __asm__("" // Declare an assembly block but don't declare any instructions + * : // However, as an Input/Output Operand, + * "+r" // constrain a read/write operand (+) as a general purpose register (r). + * (acc) // and set acc as the operand + * ); + * + * Because of the 'r', the compiler has promised that seed will be in a + * general purpose register and the '+' says that it will be 'read/write', + * so it has to assume it has changed. It is like volatile without all the + * loads and stores. + * + * Since the argument has to be in a normal register (not an SSE register), + * each time XXH32_round is called, it is impossible to vectorize. 
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
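+ *
+ * A round trip looks like this (illustrative sketch; `h32` is a previously
+ * computed hash and `out` is a placeholder output buffer):
+ *
+ *   XXH32_canonical_t c;
+ *   XXH32_canonicalFromHash(&c, h32);
+ *   memcpy(out, c.digest, sizeof(c.digest));          // always big-endian
+ *   XXH32_hash_t const back = XXH32_hashFromCanonical(&c);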
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif |
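
Note (not part of the diff above): the file added here is the upstream xxHash XXH64 implementation. It exposes a one-shot XXH64() entry point, a streaming createState/reset/update/digest sequence, and canonical (big-endian) conversion helpers; the disabled "#if 0" branch inside XXH64() shows that the one-shot and streaming paths compute the same digest. Below is a minimal standalone usage sketch of that public API. The XXH_STATIC_LINKING_ONLY / XXH_IMPLEMENTATION defines and the include path follow upstream xxHash's documented single-header pattern and are assumptions, not taken from this commit, which may compile the file into the build differently; the sample buffer is purely illustrative.

/* Hypothetical usage sketch; only API names visible in the diff are used. */
#define XXH_STATIC_LINKING_ONLY   /* assumption: expose state definitions (upstream pattern) */
#define XXH_IMPLEMENTATION        /* assumption: emit the implementation in this translation unit */
#include "xxhash.h"               /* assumption: include path as vendored */

#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char* data = "example buffer";          /* illustrative data only */
    size_t len = strlen(data);
    XXH64_hash_t seed = 0;

    /* One-shot hashing of a complete buffer. */
    XXH64_hash_t oneShot = XXH64(data, len, seed);

    /* Streaming hashing of the same bytes, fed in two chunks. */
    XXH64_state_t* state = XXH64_createState();
    XXH64_reset(state, seed);
    XXH64_update(state, data, len / 2);
    XXH64_update(state, data + len / 2, len - len / 2);
    XXH64_hash_t streamed = XXH64_digest(state);
    XXH64_freeState(state);

    /* Both paths yield the same 64-bit digest. */
    assert(oneShot == streamed);

    /* Canonical (big-endian) form, e.g. for storing the hash on disk. */
    XXH64_canonical_t canon;
    XXH64_canonicalFromHash(&canon, oneShot);
    assert(XXH64_hashFromCanonical(&canon) == oneShot);

    printf("XXH64 = %016llx\n", (unsigned long long)oneShot);
    return 0;
}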