From 9d76d63af5d496e232018d6ddf8ee1e55ad440ad Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 14 Jul 2019 04:33:36 +0200
Subject: jit: make everything configurable

---
 src/Config.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Config.h b/src/Config.h
index 84fd57b..18a7910 100644
--- a/src/Config.h
+++ b/src/Config.h
@@ -46,6 +46,9 @@ extern int Threaded3D;
 extern int GL_ScaleFactor;
 extern int GL_Antialias;
 
+extern bool JIT_Enable;
+extern int JIT_MaxBlockSize;
+
 }
 
 #endif // CONFIG_H
-- 
cgit v1.2.3

From 411fb57c07c732a2b60e3566ae045f8f60eea29d Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 14 Jul 2019 19:24:00 +0200
Subject: jit: add compile option

---
 CMakeLists.txt                     | 30 +++++++++++++++++++
 src/ARM.cpp                        | 13 ++++----
 src/ARM.h                          |  6 ++++
 src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++-----------------
 src/ARMJIT_x64/ARMJIT_Compiler.h   |  1 -
 src/CMakeLists.txt                 | 25 +++++++++-------
 src/CP15.cpp                       | 12 ++++++--
 src/Config.cpp                     |  4 +++
 src/Config.h                       |  2 ++
 src/NDS.cpp                        | 26 ++++++++++++++++
 src/dolphin/CodeBlock.h            |  3 --
 src/libui_sdl/DlgEmuSettings.cpp   | 21 +++++++++++--
 src/libui_sdl/main.cpp             |  2 ++
 13 files changed, 151 insertions(+), 55 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 048dd44..d59e19c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,36 @@ if(NOT CMAKE_BUILD_TYPE)
 	set(CMAKE_BUILD_TYPE Release)
 endif()
 
+include(CheckSymbolExists)
+function(detect_architecture symbol arch)
+    if (NOT DEFINED ARCHITECTURE)
+        set(CMAKE_REQUIRED_QUIET 1)
+        check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch})
+        unset(CMAKE_REQUIRED_QUIET)
+
+        # The output variable needs to be unique across invocations otherwise
+        # CMake's crazy scope rules will keep it defined
+        if (ARCHITECTURE_${arch})
+            set(ARCHITECTURE "${arch}" PARENT_SCOPE)
+            set(ARCHITECTURE_${arch} 1 PARENT_SCOPE)
+            add_definitions(-DARCHITECTURE_${arch}=1)
+        endif()
+    endif()
+endfunction()
+
+detect_architecture("__x86_64__" x86_64)
+detect_architecture("__i386__" x86)
+detect_architecture("__arm__" ARM)
+detect_architecture("__aarch64__" ARM64)
+
+if (ARCHITECTURE STREQUAL x86_64)
+    option(ENABLE_JIT "Enable x64 JIT recompiler" ON)
+endif()
+
+if (ENABLE_JIT)
+    add_definitions(-DJIT_ENABLED)
+endif()
+
 if (CMAKE_BUILD_TYPE STREQUAL Release)
     option(ENABLE_LTO "Enable link-time optimization" ON)
 else()
diff --git a/src/ARM.cpp b/src/ARM.cpp
index 6cc80c0..eb58d02 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -80,15 +80,8 @@ ARMv4::ARMv4() : ARM(1)
 //
 }
 
-namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];}
-
 void ARM::Reset()
 {
-    FILE* blabla = fopen("fhhg", "w");
-    for (int i = 0; i < ARMInstrInfo::ak_Count; i++)
-        fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]);
-    fclose(blabla);
-
     Cycles = 0;
     Halted = 0;
 
@@ -548,6 +541,7 @@ void ARMv5::Execute()
         Halted = 0;
 }
 
+#ifdef JIT_ENABLED
 void ARMv5::ExecuteJIT()
 {
     if (Halted)
@@ -599,6 +593,7 @@ void ARMv5::ExecuteJIT()
     if (Halted == 2)
         Halted = 0;
 }
+#endif
 
 void ARMv4::Execute()
 {
@@ -677,6 +672,7 @@ void ARMv4::Execute()
         Halted = 0;
 }
 
+#ifdef JIT_ENABLED
 void ARMv4::ExecuteJIT()
 {
     if (Halted)
@@ -728,4 +724,5 @@ void ARMv4::ExecuteJIT()
 
     if (Halted == 2)
         Halted = 0;
-}
\ No newline at end of file
+}
+#endif
\ No newline at end of file
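The ARM.cpp hunks above show the pattern this patch repeats everywhere: the recompiler entry points only exist when CMake defines JIT_ENABLED, and every call site keeps an interpreter fallback behind the same guard. A minimal stand-alone sketch of that shape follows; Cpu and RunCpu are invented names, not melonDS code.

#include <cstdio>

// Sketch of the JIT_ENABLED gate used throughout the patch.
struct Cpu
{
    void Execute()    { printf("interpreter\n"); } // always built
#ifdef JIT_ENABLED
    void ExecuteJIT() { printf("jit\n"); }         // built only when enabled
#endif
};

void RunCpu(Cpu& cpu, bool jitConfigured)
{
#ifdef JIT_ENABLED
    if (jitConfigured) // runtime toggle layered on the build-time one
    {
        cpu.ExecuteJIT();
        return;
    }
#endif
    (void)jitConfigured;
    cpu.Execute();
}

int main()
{
    Cpu cpu;
    RunCpu(cpu, true); // prints "jit" if built with -DJIT_ENABLED, else "interpreter"
    return 0;
}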
diff --git a/src/ARM.h b/src/ARM.h
index 0544301..ecdf5b4 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -52,7 +52,9 @@ public:
     }
 
     virtual void Execute() = 0;
+#ifdef JIT_ENABLED
     virtual void ExecuteJIT() = 0;
+#endif
 
     bool CheckCondition(u32 code)
     {
@@ -152,7 +154,9 @@ public:
     void DataAbort();
 
     void Execute();
+#ifdef JIT_ENABLED
     void ExecuteJIT();
+#endif
 
     // all code accesses are forced nonseq 32bit
     u32 CodeRead32(u32 addr, bool branch);
@@ -271,7 +275,9 @@ public:
     void JumpTo(u32 addr, bool restorecpsr = false);
 
     void Execute();
+#ifdef JIT_ENABLED
     void ExecuteJIT();
+#endif
 
     u16 CodeRead16(u32 addr)
     {
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
index fe23859..18cb27e 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
@@ -4,7 +4,10 @@
 #include <assert.h>
 
+#include "../dolphin/CommonFuncs.h"
+
 #ifdef _WIN32
+#include <windows.h>
 #else
 #include <sys/mman.h>
 #include <unistd.h>
@@ -32,8 +35,6 @@ const int RegisterCache<Compiler, X64Reg>::NativeRegsAvailable =
 #endif
 ;
 
-int instructionPopularityARM[ARMInstrInfo::ak_Count];
-
 /*
     We'll repurpose this .bss memory
  */
 u8 CodeMemory[1024 * 1024 * 32];
 
 Compiler::Compiler()
 {
-#ifdef _WIN32
-#else
-    u64 pagesize = sysconf(_SC_PAGE_SIZE);
-#endif
-
-    u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize);
-    u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned;
-
-#ifdef _WIN32
-#else
-    mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
-#endif
-
-    region = pageAligned;
-    region_size = alignedSize;
-    total_region_size = region_size;
+    {
+    #ifdef _WIN32
+        SYSTEM_INFO sysInfo;
+        GetSystemInfo(&sysInfo);
+
+        u64 pageSize = (u64)sysInfo.dwPageSize;
+    #else
+        u64 pageSize = sysconf(_SC_PAGE_SIZE);
+    #endif
+
+        u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize);
+        u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned;
+
+    #ifdef _WIN32
+        DWORD dummy;
+        VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy);
+    #else
+        mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
+    #endif
+
+        region = pageAligned;
+        region_size = alignedSize;
+        total_region_size = region_size;
+    }
 
     ClearCodeSpace();
-    SetCodePtr(pageAligned);
-
-    memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM));
 
     for (int i = 0; i < 3; i++)
     {
         for (int j = 0; j < 2; j++)
@@ -118,7 +123,7 @@ Compiler::Compiler()
         SetJumpTarget(und);
         MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)));
         RET();
-    }
+    }
     {
         // RSCRATCH mode
         // ABI_PARAM2 reg n
@@ -163,7 +168,10 @@ Compiler::Compiler()
         RET();
     }
 
-    ResetStart = (void*)GetWritableCodePtr();
+    // move the region forward to prevent overwriting the generated functions
+    region_size -= GetWritableCodePtr() - region;
+    total_region_size = region_size;
+    region = GetWritableCodePtr();
 }
 
 void Compiler::LoadCPSR()
@@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = {
 
 void Compiler::Reset()
 {
-    SetCodePtr((u8*)ResetStart);
+    ClearCodeSpace();
 }
 
 CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount)
@@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs
         ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9401220..10428aa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,19 +30,22 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index f232bec..e6e91c3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -812,7 +812,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -834,7 +836,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -856,8 +860,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -879,8 +885,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 37b701c..3cff0ed 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -34,8 +34,10 @@ int Threaded3D; int GL_ScaleFactor; int GL_Antialias; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -45,8 +47,10 @@ ConfigEntry ConfigFile[] = {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, NULL, 0}, {"GL_Antialias", 0, &GL_Antialias, 0, NULL, 0}, +#ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 18a7910..c13eae3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -46,8 +46,10 @@ extern int 
Threaded3D;
 extern int GL_ScaleFactor;
 extern int GL_Antialias;
 
+#ifdef JIT_ENABLED
 extern bool JIT_Enable;
 extern int JIT_MaxBlockSize;
+#endif
 
 }
diff --git a/src/NDS.cpp b/src/NDS.cpp
index 4b50d9c..62a52aa 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -162,7 +162,9 @@ bool Init()
     ARM9 = new ARMv5();
     ARM7 = new ARMv4();
 
+#ifdef JIT_ENABLED
     ARMJIT::Init();
+#endif
 
     DMAs[0] = new DMA(0, 0);
     DMAs[1] = new DMA(0, 1);
@@ -194,7 +196,9 @@ void DeInit()
     delete ARM9;
     delete ARM7;
 
+#ifdef JIT_ENABLED
     ARMJIT::DeInit();
+#endif
 
     for (int i = 0; i < 8; i++)
         delete DMAs[i];
@@ -524,7 +528,9 @@ void Reset()
     KeyCnt = 0;
     RCnt = 0;
 
+#ifdef JIT_ENABLED
     ARMJIT::InvalidateBlockCache();
+#endif
 
     NDSCart::Reset();
     GBACart::Reset();
@@ -741,10 +747,12 @@ bool DoSavestate(Savestate* file)
         GPU::SetPowerCnt(PowerControl9);
     }
 
+#ifdef JIT_ENABLED
     if (!file->Saving)
     {
         ARMJIT::InvalidateBlockCache();
     }
+#endif
 
     return true;
 }
@@ -864,9 +872,11 @@
         }
         else
         {
+#ifdef JIT_ENABLED
             if (EnableJIT)
                 ARM9->ExecuteJIT();
             else
+#endif
                 ARM9->Execute();
         }
 
@@ -889,9 +899,11 @@
         }
         else
         {
+#ifdef JIT_ENABLED
             if (EnableJIT)
                 ARM7->ExecuteJIT();
             else
+#endif
                 ARM7->Execute();
         }
 
@@ -924,9 +936,11 @@
 
 u32 RunFrame()
 {
+#ifdef JIT_ENABLED
     if (Config::JIT_Enable)
         return RunFrame<true>();
     else
+#endif
        return RunFrame<false>();
 }
diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h
index 31a8d93..e71cf6d 100644
--- a/src/dolphin/CodeBlock.h
+++ b/src/dolphin/CodeBlock.h
@@ -9,7 +9,6 @@
 
 #include "Assert.h"
 #include "../types.h"
-#include "MemoryUtil.h"
 
 namespace Common
 {
@@ -41,8 +40,6 @@ public:
   CodeBlock() = default;
   virtual ~CodeBlock()
   {
-    if (region)
-      FreeCodeSpace();
   }
   CodeBlock(const CodeBlock&) = delete;
   CodeBlock& operator=(const CodeBlock&) = delete;
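The NDS.cpp hunks above rely on RunFrame being a template over a bool EnableJIT parameter (already present before this patch), so the JIT check is resolved at compile time and the plain RunFrame() wrapper merely picks an instantiation. A minimal sketch of that dispatch pattern, with simplified stand-in bodies rather than the actual melonDS frame loop:

#include <cstdio>

// Sketch of the template dispatch used by NDS::RunFrame: the EnableJIT
// test is hoisted to compile time, so each instantiation's hot loop
// contains no per-iteration JIT check.
template <bool EnableJIT>
static int RunFrameImpl()
{
    int cycles = 0;
    for (int i = 0; i < 3; i++)
    {
        if (EnableJIT)   // constant-folded per instantiation
            cycles += 2; // stand-in for ExecuteJIT()
        else
            cycles += 1; // stand-in for Execute()
    }
    return cycles;
}

static bool JitEnabledConfig = true; // stand-in for Config::JIT_Enable

int RunFrame()
{
    // runtime config selects an instantiation exactly once per frame
    return JitEnabledConfig ? RunFrameImpl<true>() : RunFrameImpl<false>();
}

int main()
{
    printf("%d\n", RunFrame()); // prints 6
    return 0;
}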
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
index 116d2da..46f5f9f 100644
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ b/src/libui_sdl/DlgEmuSettings.cpp
@@ -38,8 +38,10 @@ uiWindow* win;
 
 uiCheckbox* cbDirectBoot;
 
+#ifdef JIT_ENABLED
 uiCheckbox* cbJITEnabled;
 uiEntry* enJITMaxBlockSize;
+#endif
 
 int OnCloseWindow(uiWindow* window, void* blarg)
 {
@@ -57,13 +59,17 @@ void OnOk(uiButton* btn, void* blarg)
 {
     Config::DirectBoot = uiCheckboxChecked(cbDirectBoot);
 
+#ifdef JIT_ENABLED
     Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled);
-    long blockSize = strtol(uiEntryText(enJITMaxBlockSize), NULL, 10);
+    char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize);
+    long blockSize = strtol(maxBlockSizeStr, NULL, 10);
+    uiFreeText(maxBlockSizeStr);
     if (blockSize < 1)
         blockSize = 1;
     if (blockSize > 32)
         blockSize = 32;
     Config::JIT_MaxBlockSize = blockSize;
+#endif
 
     Config::Save();
 
@@ -73,6 +79,7 @@ void OnOk(uiButton* btn, void* blarg)
     ApplyNewSettings(4);
 }
 
+#ifdef JIT_ENABLED
 void OnJITStateChanged(uiCheckbox* cb, void* blarg)
 {
     if (uiCheckboxChecked(cb))
@@ -80,6 +87,7 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg)
     else
         uiControlDisable(uiControl(enJITMaxBlockSize));
 }
+#endif
 
 void Open()
 {
@@ -90,7 +98,7 @@ void Open()
     }
     opened = true;
 
-    win = uiNewWindow("Emu settings - melonDS", 300, 170, 0, 0, 0);
+    win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0);
     uiWindowSetMargined(win, 1);
     uiWindowOnClosing(win, OnCloseWindow, NULL);
 
@@ -105,6 +113,7 @@ void Open()
         uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0);
     }
 
+#ifdef JIT_ENABLED
     {
         uiLabel* dummy = uiNewLabel("");
         uiBoxAppend(top, uiControl(dummy), 0);
@@ -133,6 +142,12 @@ void Open()
             uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0);
         }
     }
+#endif
+
+    {
+        uiLabel* dummy = uiNewLabel("");
+        uiBoxAppend(top, uiControl(dummy), 0);
+    }
 
     {
         uiBox* in_ctrl = uiNewHorizontalBox();
@@ -153,6 +168,7 @@ void Open()
 
     uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot);
 
+#ifdef JIT_ENABLED
     uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable);
     {
         char maxBlockSizeStr[10];
@@ -160,6 +176,7 @@ void Open()
         uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr);
     }
     OnJITStateChanged(cbJITEnabled, NULL);
+#endif
 
     uiControlShow(uiControl(win));
 }
diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp
index d6809c3..af05d7a 100644
--- a/src/libui_sdl/main.cpp
+++ b/src/libui_sdl/main.cpp
@@ -2411,8 +2411,10 @@ void ApplyNewSettings(int type)
     }
     else if (type == 4)
     {
+#ifdef JIT_ENABLED
         if (Config::JIT_Enable)
             ARMJIT::InvalidateBlockCache();
+#endif
     }
 
     EmuRunning = prevstatus;
-- 
cgit v1.2.3

From a687be9879e5cab4ea5d8646c8cf47c214b18856 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Thu, 3 Oct 2019 01:10:59 +0200
Subject: new block cache and much more...

- more reliable code invalidation detection
- blocks aren't stopped at every branch; instead, branches are followed where possible to build larger blocks
- idle loop recognition
- optimised literal loads, load/store cycle counting, and loads/stores from constant addresses

---
 src/ARM.cpp                         |  44 ++-
 src/ARM.h                           |  16 +-
 src/ARMInterpreter.h                |   9 +
 src/ARMJIT.cpp                      | 755 ++++++++++++++++++++++++++++++------
 src/ARMJIT.h                        | 141 ++-----
 src/ARMJIT_Internal.h               | 198 ++++++++++
 src/ARMJIT_RegisterCache.h          |  36 +-
 src/ARMJIT_x64/ARMJIT_ALU.cpp       |  16 +-
 src/ARMJIT_x64/ARMJIT_Branch.cpp    |  43 +-
 src/ARMJIT_x64/ARMJIT_Compiler.cpp  | 184 +++++++--
 src/ARMJIT_x64/ARMJIT_Compiler.h    |  51 ++-
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++----------------
 src/ARM_InstrInfo.cpp               |  47 ++-
 src/ARM_InstrInfo.h                 |  11 +-
 src/CP15.cpp                        |  12 +-
 src/Config.cpp                      |   2 +
 src/Config.h                        |   1 +
 src/NDS.cpp                         |  22 +-
 src/libui_sdl/DlgEmuSettings.cpp    |  22 +-
 19 files changed, 1550 insertions(+), 689 deletions(-)
 create mode 100644 src/ARMJIT_Internal.h

diff --git a/src/ARM.cpp b/src/ARM.cpp
index e404943..423c940 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -580,21 +580,26 @@ void ARMv5::ExecuteJIT()
             return;
         }
 
-        ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr);
-        Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -710,23 +715,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -736,6 +746,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -758,6 +770,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 4d387bc..8a01068 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -299,7 +299,7 @@ public: { *val = NDS::ARM7Read8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -308,7 +308,7 @@ public: *val = NDS::ARM7Read16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -317,7 +317,7 @@ public: *val = NDS::ARM7Read32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -325,14 +325,14 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -341,7 +341,7 @@ public: NDS::ARM7Write16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -350,7 +350,7 @@ public: NDS::ARM7Write32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -358,7 +358,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h 
b/src/ARMInterpreter.h
index 7244238..2bf8167 100644
--- a/src/ARMInterpreter.h
+++ b/src/ARMInterpreter.h
@@ -28,6 +28,15 @@ namespace ARMInterpreter
 extern void (*ARMInstrTable[4096])(ARM* cpu);
 extern void (*THUMBInstrTable[1024])(ARM* cpu);
 
+void A_MSR_IMM(ARM* cpu);
+void A_MSR_REG(ARM* cpu);
+void A_MRS(ARM* cpu);
+void A_MCR(ARM* cpu);
+void A_MRC(ARM* cpu);
+void A_SVC(ARM* cpu);
+
+void T_SVC(ARM* cpu);
+
 void A_BLX_IMM(ARM* cpu); // I'm a special one look at me
 
 }
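Exporting the individual handlers above lets the recompiler fall back to the interpreter one instruction at a time: ARMJIT.cpp (next diff) builds InterpretARM[]/InterpretTHUMB[] tables from them, indexed by the decoded instruction kind, and executes an uncompilable instruction with a single indirect call. A self-contained sketch of that dispatch shape, where every type and handler below is a stand-in rather than melonDS code:

#include <cstdio>

// Sketch: a table of interpreter entry points indexed by decoded
// instruction kind, as ARMJIT.cpp sets up from ARMInterpreter's handlers.
struct ARM { int r0; };

typedef void (*InterpreterFunc)(ARM* cpu);

static void A_MOV(ARM* cpu) { cpu->r0 = 1; }
static void A_ADD(ARM* cpu) { cpu->r0 += 2; }

static InterpreterFunc InterpretARM[] = { A_MOV, A_ADD };

int main()
{
    ARM cpu = {};
    InterpretARM[0](&cpu); // dispatch by instruction kind, not raw opcode
    InterpretARM[1](&cpu);
    printf("%d\n", cpu.r0); // prints 3
    return 0;
}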
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 85cadf3..686bdd6 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -1,122 +1,137 @@
 #include "ARMJIT.h"
 
 #include <string.h>
+#include <assert.h>
 
 #include "Config.h"
 
+#include "ARMJIT_Internal.h"
 #include "ARMJIT_x64/ARMJIT_Compiler.h"
 
+#include "ARMInterpreter_ALU.h"
+#include "ARMInterpreter_LoadStore.h"
+#include "ARMInterpreter_Branch.h"
+#include "ARMInterpreter.h"
+
+#include "GPU3D.h"
+#include "SPU.h"
+#include "Wifi.h"
+
 namespace ARMJIT
 {
 
+#define JIT_DEBUGPRINT(msg, ...)
+
 Compiler* compiler;
-BlockCache cache;
 
+const u32 ExeMemRegionSizes[] = {
+    0x8000,      // Unmapped Region (dummy)
+    0x8000,      // ITCM
+    4*1024*1024, // Main RAM
+    0x8000,      // SWRAM
+    0xA4000,     // LCDC
+    0x8000,      // ARM9 BIOS
+    0x4000,      // ARM7 BIOS
+    0x10000,     // ARM7 WRAM
+    0x40000      // ARM7 WVRAM
+};
 
-static ptrdiff_t JIT_MEM[2][32] = {
-    //arm9
-    {
-        /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)),
-        /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror
-        /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)),
-        /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)),
-        /* 4X*/ DUP2(-1),
-        /* 5X*/ DUP2(-1),
-        /* 6X*/ -1,
-                offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
-        /* 7X*/ DUP2(-1),
-        /* 8X*/ DUP2(-1),
-        /* 9X*/ DUP2(-1),
-        /* AX*/ DUP2(-1),
-        /* BX*/ DUP2(-1),
-        /* CX*/ DUP2(-1),
-        /* DX*/ DUP2(-1),
-        /* EX*/ DUP2(-1),
-        /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS))
-    },
-    //arm7
-    {
-        /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)),
-        /* 1X*/ DUP2(-1),
-        /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)),
-        /* 3X*/ offsetof(BlockCache, SWRAM),
-                offsetof(BlockCache, ARM7_WRAM),
-        /* 4X*/ DUP2(-1),
-        /* 5X*/ DUP2(-1),
-        /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself,
-                                                           DeSmuME doesn't mirror the 64 MB region at 0x6800000 */
-        /* 7X*/ DUP2(-1),
-        /* 8X*/ DUP2(-1),
-        /* 9X*/ DUP2(-1),
-        /* AX*/ DUP2(-1),
-        /* BX*/ DUP2(-1),
-        /* CX*/ DUP2(-1),
-        /* DX*/ DUP2(-1),
-        /* EX*/ DUP2(-1),
-        /* FX*/ DUP2(-1)
-    }
-};
+const u32 ExeMemRegionOffsets[] = {
+    0,
+    0x8000,
+    0x10000,
+    0x410000,
+    0x418000,
+    0x4BC000,
+    0x4C4000,
+    0x4C8000,
+    0x4D8000,
+    0x518000,
+};
 
-static u32 JIT_MASK[2][32] = {
-    //arm9
-    {
-        /* 0X*/ DUP2(0x00007FFF),
-        /* 1X*/ DUP2(0x00007FFF),
-        /* 2X*/ DUP2(0x003FFFFF),
-        /* 3X*/ DUP2(0x00007FFF),
-        /* 4X*/ DUP2(0x00000000),
-        /* 5X*/ DUP2(0x00000000),
-        /* 6X*/ 0x00000000,
-                0x000FFFFF,
-        /* 7X*/ DUP2(0x00000000),
-        /* 8X*/ DUP2(0x00000000),
-        /* 9X*/ DUP2(0x00000000),
-        /* AX*/ DUP2(0x00000000),
-        /* BX*/ DUP2(0x00000000),
-        /* CX*/ DUP2(0x00000000),
-        /* DX*/ DUP2(0x00000000),
-        /* EX*/ DUP2(0x00000000),
-        /* FX*/ DUP2(0x00007FFF)
-    },
-    //arm7
-    {
-        /* 0X*/ DUP2(0x00003FFF),
-        /* 1X*/ DUP2(0x00000000),
-        /* 2X*/ DUP2(0x003FFFFF),
-        /* 3X*/ 0x00007FFF,
-                0x0000FFFF,
-        /* 4X*/ 0x00000000,
-                0x0000FFFF,
-        /* 5X*/ DUP2(0x00000000),
-        /* 6X*/ DUP2(0x0003FFFF),
-        /* 7X*/ DUP2(0x00000000),
-        /* 8X*/ DUP2(0x00000000),
-        /* 9X*/ DUP2(0x00000000),
-        /* AX*/ DUP2(0x00000000),
-        /* BX*/ DUP2(0x00000000),
-        /* CX*/ DUP2(0x00000000),
-        /* DX*/ DUP2(0x00000000),
-        /* EX*/ DUP2(0x00000000),
-        /* FX*/ DUP2(0x00000000)
-    }
-};
+#define DUP2(x) x, x
+
+const static ExeMemKind JIT_MEM[2][32] = {
+    //arm9
+    {
+        /* 0X*/ DUP2(exeMem_ITCM),
+        /* 1X*/ DUP2(exeMem_ITCM), // mirror
+        /* 2X*/ DUP2(exeMem_MainRAM),
+        /* 3X*/ DUP2(exeMem_SWRAM),
+        /* 4X*/ DUP2(exeMem_Unmapped),
+        /* 5X*/ DUP2(exeMem_Unmapped),
+        /* 6X*/ exeMem_Unmapped,
+                exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
+        /* 7X*/ DUP2(exeMem_Unmapped),
+        /* 8X*/ DUP2(exeMem_Unmapped),
+        /* 9X*/ DUP2(exeMem_Unmapped),
+        /* AX*/ DUP2(exeMem_Unmapped),
+        /* BX*/ DUP2(exeMem_Unmapped),
+        /* CX*/ DUP2(exeMem_Unmapped),
+        /* DX*/ DUP2(exeMem_Unmapped),
+        /* EX*/ DUP2(exeMem_Unmapped),
+        /* FX*/ DUP2(exeMem_ARM9_BIOS)
+    },
+    //arm7
+    {
+        /* 0X*/ DUP2(exeMem_ARM7_BIOS),
+        /* 1X*/ DUP2(exeMem_Unmapped),
+        /* 2X*/ DUP2(exeMem_MainRAM),
+        /* 3X*/ exeMem_SWRAM,
+                exeMem_ARM7_WRAM,
+        /* 4X*/ DUP2(exeMem_Unmapped),
+        /* 5X*/ DUP2(exeMem_Unmapped),
+        /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself,
+                                            DeSmuME doesn't mirror the 64 MB region at 0x6800000 */
+        /* 7X*/ DUP2(exeMem_Unmapped),
+        /* 8X*/ DUP2(exeMem_Unmapped),
+        /* 9X*/ DUP2(exeMem_Unmapped),
+        /* AX*/ DUP2(exeMem_Unmapped),
+        /* BX*/ DUP2(exeMem_Unmapped),
+        /* CX*/ DUP2(exeMem_Unmapped),
+        /* DX*/ DUP2(exeMem_Unmapped),
+        /* EX*/ DUP2(exeMem_Unmapped),
+        /* FX*/ DUP2(exeMem_Unmapped)
+    }
 };
 
 #undef DUP2
 
+/*
+    translates address to pseudo physical address
+    - more compact, eliminates mirroring, everything comes in a row
+    - we only need one translation table
+*/
+u32 AddrTranslate9[0x2000];
+u32 AddrTranslate7[0x4000];
+
+JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
+AddressRange CodeRanges[ExeMemSpaceSize / 256];
+
+TinyVector<JitBlock*> JitBlocks;
+JitBlock* RestoreCandidates[0x1000] = {NULL};
+
+u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
+{
+    return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
+}
+
 void Init()
 {
-    memset(&cache, 0, sizeof(BlockCache));
-
     for (int i = 0; i < 0x2000; i++)
-        cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL :
-            (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8])
-            + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1);
+    {
+        ExeMemKind kind = JIT_MEM[0][i >> 8];
+        u32 size = ExeMemRegionSizes[kind];
+
+        AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1));
+    }
     for (int i = 0; i < 0x4000; i++)
-        cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? 
NULL :
-            (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9])
-            + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1);
+    {
+        ExeMemKind kind = JIT_MEM[1][i >> 9];
+        u32 size = ExeMemRegionSizes[kind];
+
+        AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1));
+    }
 
     compiler = new Compiler();
 }
@@ -126,7 +141,7 @@ void DeInit()
     delete compiler;
 }
 
-void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
+void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
 {
     for (int j = start; j >= 0; j--)
     {
@@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
     }
 }
 
+bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr)
+{
+    if (thumb)
+    {
+        u32 r15 = instr.Addr + 4;
+        cond = 0xE;
+
+        if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12)))
+        {
+            targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9);
+            targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1;
+            return true;
+        }
+        else if (instr.Info.Kind == ARMInstrInfo::tk_B)
+        {
+            s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20;
+            targetAddr = r15 + offset;
+            return true;
+        }
+        else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND)
+        {
+            cond = (instr.Instr >> 8) & 0xF;
+            s32 offset = (s32)(instr.Instr << 24) >> 23;
+            targetAddr = r15 + offset;
+            return true;
+        }
+    }
+    else
+    {
+        cond = instr.Cond();
+        if (instr.Info.Kind == ARMInstrInfo::ak_BL
+            || instr.Info.Kind == ARMInstrInfo::ak_B)
+        {
+            s32 offset = (s32)(instr.Instr << 8) >> 6;
+            u32 r15 = instr.Addr + 8;
+            targetAddr = r15 + offset;
+            return true;
+        }
+    }
+    return false;
+}
+
+bool IsIdleLoop(FetchedInstr* instrs, int instrsCount)
+{
+    // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678
+    // it basically checks whether one iteration of a loop depends on another;
+    // the rules are quite simple
+
+    u16 regsWrittenTo = 0;
+    u16 regsDisallowedToWrite = 0;
+    for (int i = 0; i < instrsCount; i++)
+    {
+        //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite);
+        if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem)
+            return false;
+        if (i < instrsCount - 1 && instrs[i].Info.Branches())
+            return false;
+
+        u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15);
+        u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15);
+
+        regsDisallowedToWrite |= srcRegs & ~regsWrittenTo;
+
+        if (dstRegs & regsDisallowedToWrite)
+            return false;
+        regsWrittenTo |= dstRegs;
+    }
+    return true;
+}
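IsIdleLoop() above applies the Dolphin-derived rule: a candidate loop is idle only if it never writes memory, only its last instruction branches, and no register it reads before writing could be changed by a later write within the same iteration. A compact stand-alone restatement of just the register-dependency part follows; the Instr record and the example masks are invented, and the r15 masking and branch checks of the real function are folded into the lead-in assumptions:

#include <cstdio>
#include <cstdint>

// Restatement of the idle-loop dependency check: track which registers
// the loop body has written; any register read before the loop writes it
// must stay constant, so a later write to it makes the loop non-idle.
struct Instr
{
    uint16_t srcRegs;   // bitmask of registers read
    uint16_t dstRegs;   // bitmask of registers written
    bool     writesMem; // stores always disqualify the loop
};

static bool IsIdleLoopSketch(const Instr* instrs, int count)
{
    uint16_t written = 0, disallowed = 0;
    for (int i = 0; i < count; i++)
    {
        if (instrs[i].writesMem)
            return false;
        disallowed |= instrs[i].srcRegs & ~written;
        if (instrs[i].dstRegs & disallowed)
            return false;
        written |= instrs[i].dstRegs;
    }
    return true;
}

int main()
{
    // ldr r0, [r1]; cmp r0, #0 -- r1 is never written, so the loop is idle
    Instr loop[] = { {1 << 1, 1 << 0, false}, {1 << 0, 0, false} };
    printf("%d\n", IsIdleLoopSketch(loop, 2)); // prints 1
    return 0;
}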
+typedef void (*InterpreterFunc)(ARM* cpu);
+
+#define F(x) &ARMInterpreter::A_##x
+#define F_ALU(name, s) \
+    F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \
+    F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s)
+#define F_MEM_WB(name) \
+    F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \
+    F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM)
+#define F_MEM_HD(name) \
+    F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM)
+InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] =
+{
+    F_ALU(AND,), F_ALU(AND,_S),
+    F_ALU(EOR,), F_ALU(EOR,_S),
+    F_ALU(SUB,), F_ALU(SUB,_S),
+    F_ALU(RSB,), F_ALU(RSB,_S),
+    F_ALU(ADD,), F_ALU(ADD,_S),
+    F_ALU(ADC,), F_ALU(ADC,_S),
+    F_ALU(SBC,), F_ALU(SBC,_S),
+    F_ALU(RSC,), F_ALU(RSC,_S),
+    F_ALU(ORR,), F_ALU(ORR,_S),
+    F_ALU(MOV,), F_ALU(MOV,_S),
+    F_ALU(BIC,), F_ALU(BIC,_S),
+    F_ALU(MVN,), F_ALU(MVN,_S),
+    F_ALU(TST,),
+    F_ALU(TEQ,),
+    F_ALU(CMP,),
+    F_ALU(CMN,),
+
+    F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy),
+    F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB),
+
+    F_MEM_WB(STR),
+    F_MEM_WB(STRB),
+    F_MEM_WB(LDR),
+    F_MEM_WB(LDRB),
+
+    F_MEM_HD(STRH),
+    F_MEM_HD(LDRD),
+    F_MEM_HD(STRD),
+    F_MEM_HD(LDRH),
+    F_MEM_HD(LDRSB),
+    F_MEM_HD(LDRSH),
+
+    F(SWP), F(SWPB),
+    F(LDM), F(STM),
+
+    F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG),
+    F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC)
+};
+#undef F_ALU
+#undef F_MEM_WB
+#undef F_MEM_HD
+#undef F
+
+#define F(x) ARMInterpreter::T_##x
+InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] =
+{
+    F(LSL_IMM), F(LSR_IMM), F(ASR_IMM),
+    F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_),
+    F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM),
+    F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG),
+    F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG),
+    F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG),
+    F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG),
+    F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP),
+    F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG),
+    F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM),
+    F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL),
+    F(PUSH), F(POP), F(LDMIA), F(STMIA),
+    F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2),
+    F(UNK), F(SVC),
+    NULL // BL_LONG pseudo opcode
+};
+#undef F
+
+void CompileBlock(ARM* cpu)
 {
     bool thumb = cpu->CPSR & 0x20;
 
@@ -153,17 +315,41 @@
     if (Config::JIT_MaxBlockSize > 32)
         Config::JIT_MaxBlockSize = 32;
 
+    u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);
+    if (!(cpu->Num == 0
+        ? IsMapped<0>(blockAddr)
+        : IsMapped<1>(blockAddr)))
+    {
+        printf("Trying to compile a block in unmapped memory: %x\n", blockAddr);
+    }
+
+    u32 pseudoPhysicalAddr = cpu->Num == 0
+        ? TranslateAddr<0>(blockAddr)
+        : TranslateAddr<1>(blockAddr);
+
     FetchedInstr instrs[Config::JIT_MaxBlockSize];
     int i = 0;
-    u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);
     u32 r15 = cpu->R[15];
+
+    u32 addresseRanges[32] = {};
+    u32 numAddressRanges = 0;
+
     cpu->FillPipeline();
     u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]};
     u32 nextInstrAddr[2] = {blockAddr, r15};
+
+    JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n",
+        blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2],
+        cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr),
+        CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated);
+
+    u32 lastSegmentStart = blockAddr;
+
     do
     {
         r15 += thumb ? 2 : 4;
 
+        instrs[i].BranchFlags = 0;
         instrs[i].SetFlags = 0;
         instrs[i].Instr = nextInstr[0];
         instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1];
@@ -171,6 +357,25 @@
         instrs[i].Addr = nextInstrAddr[0];
         nextInstrAddr[0] = nextInstrAddr[1];
         nextInstrAddr[1] = r15;
+        JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr);
+
+        u32 translatedAddr = (cpu->Num == 0
+            ? 
TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
256].Blocks.Clear();
+            CodeRanges[addr / 256].TimesInvalidated = 0;
+        }
+        delete block;
+    }
+    JitBlocks.Clear();
 
     compiler->Reset();
 }
 
+void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
+{
+    if (cpu->Num == 0)
+    {
+        if ((addr & 0xFF000000) == 0x04000000)
+        {
+            /*
+                unfortunately we can't map GPU2D this way
+                since it's hidden inside an object
+
+                though GPU3D registers are accessed much more intensively
+            */
+            if (addr >= 0x04000320 && addr < 0x040006A4)
+            {
+                switch (size | store)
+                {
+                case 8: return (void*)GPU3D::Read8;
+                case 9: return (void*)GPU3D::Write8;
+                case 16: return (void*)GPU3D::Read16;
+                case 17: return (void*)GPU3D::Write16;
+                case 32: return (void*)GPU3D::Read32;
+                case 33: return (void*)GPU3D::Write32;
+                }
+            }
+
+            switch (size | store)
+            {
+            case 8: return (void*)NDS::ARM9IORead8;
+            case 9: return (void*)NDS::ARM9IOWrite8;
+            case 16: return (void*)NDS::ARM9IORead16;
+            case 17: return (void*)NDS::ARM9IOWrite16;
+            case 32: return (void*)NDS::ARM9IORead32;
+            case 33: return (void*)NDS::ARM9IOWrite32;
+            }
+        }
+    }
+    else
+    {
+        switch (addr & 0xFF800000)
+        {
+        case 0x04000000:
+            if (addr >= 0x04000400 && addr < 0x04000520)
+            {
+                switch (size | store)
+                {
+                case 8: return (void*)SPU::Read8;
+                case 9: return (void*)SPU::Write8;
+                case 16: return (void*)SPU::Read16;
+                case 17: return (void*)SPU::Write16;
+                case 32: return (void*)SPU::Read32;
+                case 33: return (void*)SPU::Write32;
+                }
+            }
+
+            switch (size | store)
+            {
+            case 8: return (void*)NDS::ARM7IORead8;
+            case 9: return (void*)NDS::ARM7IOWrite8;
+            case 16: return (void*)NDS::ARM7IORead16;
+            case 17: return (void*)NDS::ARM7IOWrite16;
+            case 32: return (void*)NDS::ARM7IORead32;
+            case 33: return (void*)NDS::ARM7IOWrite32;
+            }
+            break;
+        case 0x04800000:
+            if (addr < 0x04810000 && size == 16)
+            {
+                if (store)
+                    return (void*)Wifi::Write;
+                else
+                    return (void*)Wifi::Read;
+            }
+            break;
+        }
+    }
+    return NULL;
+}
+
 }
\ No newline at end of file
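GetFuncForAddr() above lets the compiler emit a direct call to the right I/O handler when a load/store address is a compile-time constant, using size | store as the selector: size is 8/16/32 and bit 0 marks a write, so read/write handler pairs land on 8/9, 16/17 and 32/33. A self-contained sketch of just that selector encoding, with stand-in handler names instead of the real NDS/GPU3D/SPU functions:

#include <cstdio>

// Sketch of the (size | store) selector used by GetFuncForAddr().
static const char* PickIOHandler(int size, bool store)
{
    switch (size | (int)store)
    {
    case 8:  return "Read8";
    case 9:  return "Write8";
    case 16: return "Read16";
    case 17: return "Write16";
    case 32: return "Read32";
    case 33: return "Write32";
    }
    return nullptr; // fall back to the generic memory path
}

int main()
{
    printf("%s\n", PickIOHandler(16, true)); // prints Write16
    return 0;
}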
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 7e448ef..1db4d66 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -9,142 +9,67 @@ namespace ARMJIT
{
-typedef u32 (*CompiledBlock)();
-
-struct FetchedInstr
+enum ExeMemKind
 {
-    u32 A_Reg(int pos) const
-    {
-        return (Instr >> pos) & 0xF;
-    }
-
-    u32 T_Reg(int pos) const
-    {
-        return (Instr >> pos) & 0x7;
-    }
-
-    u32 Cond() const
-    {
-        return Instr >> 28;
-    }
-
-    u8 SetFlags;
-    u32 Instr;
-    u32 NextInstr[2];
-    u32 Addr;
-
-    u8 CodeCycles;
-
-    ARMInstrInfo::Info Info;
+    exeMem_Unmapped = 0,
+    exeMem_ITCM,
+    exeMem_MainRAM,
+    exeMem_SWRAM,
+    exeMem_LCDC,
+    exeMem_ARM9_BIOS,
+    exeMem_ARM7_BIOS,
+    exeMem_ARM7_WRAM,
+    exeMem_ARM7_WVRAM,
+    exeMem_Count
 };
 
-/*
-    Copied from DeSmuME
-    Some names were changed to match the nomenclature of melonDS
+extern const u32 ExeMemRegionOffsets[];
+extern const u32 ExeMemRegionSizes[];
 
-    Since it's nowhere explained and at least I needed some time to get behind it,
-    here's a summary on how it works:
-    more or less all memory locations from which code can be executed are
-    represented by an array of function pointers, which point to null or
-    a function which executes a block of instructions starting from there.
+typedef u32 (*JitBlockEntry)();
 
-    The most significant 4 bits of each address are ignored. This 28 bit space is
-    divided into 0x2000 32 KB regions for the ARM9 and 0x4000 16 KB regions for the ARM7,
-    each of which holds a pointer to the relevant place inside the aforementioned arrays.
-    32 and 16 KB are the sizes of the smallest contiguous memory region mapped to the respective CPU.
-    Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary,
-    we only need every second half word to be addressable.
+extern u32 AddrTranslate9[0x2000];
+extern u32 AddrTranslate7[0x4000];
 
-    In case a memory write hits mapped memory, the function block at this
-    address is set to null, so it's recompiled the next time it's executed.
-
-    This method has disadvantages, namely that only writing to the
-    first instruction of a block marks it as invalid and that memory remapping
-    (SWRAM and VRAM) isn't taken into account.
-*/
-
-struct BlockCache
-{
-    CompiledBlock* AddrMapping9[0x2000] = {0};
-    CompiledBlock* AddrMapping7[0x4000] = {0};
-
-    CompiledBlock MainRAM[4*1024*1024/2];
-    CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
-    CompiledBlock ARM9_ITCM[0x8000/2];
-    CompiledBlock ARM9_LCDC[0xA4000/2];
-    CompiledBlock ARM9_BIOS[0x8000/2];
-    CompiledBlock ARM7_BIOS[0x4000/2];
-    CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM
-    CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM
-};
-
-extern BlockCache cache;
+const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
+extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 
 template <u32 num>
 inline bool IsMapped(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped];
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped];
 }
 
 template <u32 num>
-inline CompiledBlock LookUpBlock(u32 addr)
+inline u32 TranslateAddr(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF);
 }
 
 template <u32 num>
-inline void Invalidate16(u32 addr)
+inline JitBlockEntry LookUpBlock(u32 addr)
 {
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-            cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL;
-        else
-            cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
-    }
-}
-
-template <u32 num>
-inline void Invalidate32(u32 addr)
-{
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-        {
-            CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
-            page[(addr & 0x7FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x7FFF) >> 1] = NULL;
-        }
-        else
-        {
-            CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
-            page[(addr & 0x3FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x3FFF) >> 1] = NULL;
-        }
-    }
-}
-
-template <u32 num>
-inline void InsertBlock(u32 addr, CompiledBlock func)
-{
-    if (num == 0)
-        cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func;
-    else
-        cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
+    return FastBlockAccess[TranslateAddr<num>(addr) / 2];
 }
 
 void Init();
 void DeInit();
 
-CompiledBlock CompileBlock(ARM* cpu);
+void InvalidateByAddr(u32 pseudoPhysical);
+void InvalidateAll();
+
+void InvalidateITCM(u32 addr);
+void InvalidateByAddr7(u32 addr);
+
+void CompileBlock(ARM* cpu);
 
-void InvalidateBlockCache();
+void ResetBlockCache();
 
 }
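The two AddrTranslate tables above collapse the mirrored CPU address spaces into one flat "pseudo physical" space: one u32 per 32 KB granule (16 KB on the ARM7) maps to a region base plus wrapped offset, so mirrors of the same RAM resolve to identical pseudo addresses and a single FastBlockAccess array can index blocks. A self-contained sketch of the same translation, with one invented 4 MB region standing in for the real layout:

#include <cstdio>
#include <cstdint>

// Sketch of the AddrTranslate9 scheme; the region constants are invented.
static const uint32_t RegionOffset = 0x10000;  // where "RAM" lives in the flat space
static const uint32_t RegionSize   = 0x400000; // 4 MB, power of two

static uint32_t translate[0x2000]; // indexed by (addr & 0xFFFFFFF) >> 15

static void InitTranslate()
{
    for (uint32_t i = 0; i < 0x2000; i++)
        translate[i] = RegionOffset + ((i << 15) & (RegionSize - 1));
}

static uint32_t TranslateAddr(uint32_t addr)
{
    return translate[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
}

int main()
{
    InitTranslate();
    // a main-RAM address and one of its mirrors land on the same pseudo address
    printf("%x %x\n", TranslateAddr(0x02000000), TranslateAddr(0x02400000));
    return 0;
}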
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
+class JitBlock
+{
+public:
+    JitBlock(u32 numInstrs, u32 numAddresses)
+    {
+        NumInstrs = numInstrs;
+        NumAddresses = numAddresses;
+        Data = new u32[numInstrs + numAddresses];
+    }
+
+    ~JitBlock()
+    {
+        delete[] Data;
+    }
+
+    u32 StartAddr;
+    u32 PseudoPhysicalAddr;
+
+    u32 NumInstrs;
+    u32 NumAddresses;
+
+    JitBlockEntry EntryPoint;
+
+    u32* Instrs()
+    { return Data; }
+    u32* AddressRanges()
+    { return Data + NumInstrs; }
+
+private:
+    /*
+        0..<NumInstrs - 1>: the instructions of the block
+        NumInstrs..<NumInstrs + NumAddresses - 1>: the pseudo physical address ranges
+    */
+    u32* Data;
+};
+
+struct __attribute__((packed)) AddressRange
+{
+    TinyVector<JitBlock*> Blocks;
+    u16 TimesInvalidated;
+};
+
+extern AddressRange CodeRanges[ExeMemSpaceSize / 256];
+
+typedef void (*InterpreterFunc)(ARM* cpu);
+extern InterpreterFunc InterpretARM[];
+extern InterpreterFunc InterpretTHUMB[];
+
+void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size);
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h
index fe2f203..ed6a2b7 100644
--- a/src/ARMJIT_RegisterCache.h
+++ b/src/ARMJIT_RegisterCache.h
@@ -60,15 +60,46 @@ public:
         assert("Welp!");
     }
 
+    void PutLiteral(int reg, u32 val)
+    {
+        LiteralsLoaded |= (1 << reg);
+        LiteralValues[reg] = val;
+    }
+
+    void UnloadLiteral(int reg)
+    {
+        LiteralsLoaded &= ~(1 << reg);
+    }
+
+    bool IsLiteral(int reg)
+    {
+        return LiteralsLoaded & (1 << reg);
+    }
+
+    void PrepareExit()
+    {
+        BitSet16 dirtyRegs(DirtyRegs);
+        for (int reg : dirtyRegs)
+            Compiler->SaveReg(reg, Mapping[reg]);
+    }
+
     void Flush()
     {
         BitSet16 loadedSet(LoadedRegs);
         for (int reg : loadedSet)
             UnloadRegister(reg);
+        LiteralsLoaded = 0;
     }
 
     void Prepare(bool thumb, int i)
     {
+        if (LoadedRegs & (1 << 15))
+            UnloadRegister(15);
+
+        BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs);
+        for (int reg : invalidedLiterals)
+            UnloadLiteral(reg);
+
         u16 futureNeeded = 0;
         int ranking[16];
         for (int j = 0; j < 16; j++)
@@ -86,7 +117,7 @@ public:
         for (int reg : neverNeededAgain)
             UnloadRegister(reg);
 
-        FetchedInstr Instr = Instrs[i];
+        FetchedInstr Instr = Instrs[i];
         u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs;
         BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs);
         if (needToBeLoaded != BitSet16(0))
@@ -125,6 +156,9 @@ public:
     static const int NativeRegsAvailable;
 
     Reg Mapping[16];
+    u32 LiteralValues[16];
+
+    u16 LiteralsLoaded = 0;
     u32 NativeRegsUsed = 0;
     u16 LoadedRegs = 0;
     u16 DirtyRegs = 0;
diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp
index f868ddf..14c223b 100644
--- a/src/ARMJIT_x64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp
@@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp()
         MOV(32, rd, op2);
 
     if (((CurInstr.Instr >> 21) & 0xF) == 0xF)
+    {
         NOT(32, rd);
+        if (op2.IsImm() && CurInstr.Cond() == 0xE)
+            RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32());
+    }
+    else if (op2.IsImm() && CurInstr.Cond() == 0xE)
+        RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32());
 
     if (S)
     {
@@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_()
 
     Comp_AddCycles_C();
 
-    if (op & 1)
+    // special case for thumb mov being an alias for add rd, rn, #0
+    if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0)
+    {
+        if (rd != rs)
+            MOV(32, rd, rs);
+    }
+    else if (op & 1)
         Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV);
     else
         Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV);
@@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU()
     u32 op = (CurInstr.Instr >> 6) & 0xF;
 
     if ((op >= 0x2 && op < 0x4) || op == 0x7)
-        Comp_AddCycles_CI(1);
+        Comp_AddCycles_CI(1); // shift by reg
     else
         Comp_AddCycles_C();
 
diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index cc7a3c4..0dedb3f 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -16,9 +16,6 @@ int squeezePointer(T* ptr)
 void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
 {
     // we can simplify constant branches by a lot
-    // it's not completely safe to assume stuff like, which instructions to preload
-    // we'll see how it works out
-
     IrregularCycles = true;
 
     u32 newPC;
@@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE;

-        if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
+        if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
         {
             MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15));
-            MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
-            MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));
-
             if (comp == NULL)
+            {
+                MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
+                MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));
+
                 SaveCPSR();
+            }
         }
-
+
         if (comp != NULL)
             RegCache.Prepare(Thumb, i);
         else
@@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
         if (Thumb)
         {
-            u32 icode = (CurInstr.Instr >> 6) & 0x3FF;
             if (comp == NULL)
             {
                 MOV(64, R(ABI_PARAM1), R(RCPU));

-                ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]);
+                ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]);
             }
             else
                 (this->*comp)();
@@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
                 }
             }
             else if (cond == 0xF)
+            {
                 Comp_AddCycles_C();
+            }
             else
             {
                 IrregularCycles = false;
@@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
                 if (cond < 0xE)
                     skipExecute = CheckCondition(cond);

-                u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0);
                 if (comp == NULL)
                 {
                     MOV(64, R(ABI_PARAM1), R(RCPU));

-                    ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]);
+                    ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]);
                 }
                 else
                     (this->*comp)();

+                Comp_SpecialBranchBehaviour();
+
                 if (CurInstr.Cond() < 0xE)
                 {
-                    if (IrregularCycles)
+                    if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken))
                     {
                         FixupBranch skipFailed = J();
                         SetJumpTarget(skipExecute);

                         Comp_AddCycles_C(true);

+                        if (CurInstr.BranchFlags & branch_FollowCondTaken)
+                        {
+                            RegCache.PrepareExit();
+                            SaveCPSR(false);
+
+                            MOV(32, R(RAX), Imm32(ConstantCycles));
+                            ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
+                            RET();
+                        }
+
                         SetJumpTarget(skipFailed);
                     }
                     else
@@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
     ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
     RET();

+    /*FILE* codeout = fopen("codeout", "a");
+    fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr);
+    fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout);
+
+    fclose(codeout);*/
+
     return res;
 }

@@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add)
     }
 }

+void Compiler::Comp_AddCycles_CDI()
+{
+    if (Num == 0)
+        Comp_AddCycles_CD();
+    else
+    {
+        IrregularCycles = true;
+
+        s32 cycles;
+
+        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
+        s32 numD = CurInstr.DataCycles;
+
+        if (CurInstr.DataRegion == 0x02) // mainRAM
+        {
+            if (CodeRegion == 0x02)
+                cycles = numC + numD;
+            else
+            {
+                numC++;
+                cycles = std::max(numC + numD - 3, std::max(numC, numD));
+            }
+        }
+        else if (CodeRegion == 0x02)
+        {
+            numD++;
+            cycles = std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else
+        {
+            cycles = numC + numD + 1;
+        }
+
+        printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles);
+
+        if (!Thumb && CurInstr.Cond() < 0xE)
+            ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
+        else
+            ConstantCycles += cycles;
+    }
+}
+
+void Compiler::Comp_AddCycles_CD()
+{
+    u32 cycles = 0;
+    if (Num == 0)
+    {
+        s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles;
+        s32 numD = CurInstr.DataCycles;
+
+        //if (DataRegion != CodeRegion)
+            cycles = std::max(numC + numD - 6, std::max(numC, numD));
+
+        IrregularCycles = cycles != numC;
+    }
+    else
+    {
+        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
+        s32 numD = CurInstr.DataCycles;
+
+        if (CurInstr.DataRegion == 0x02)
+        {
+            if (CodeRegion == 0x02)
+                cycles += numC + numD;
+            else
+                cycles += std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else if (CodeRegion == 0x02)
+        {
+            cycles += std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else
+        {
+            cycles += numC + numD;
+        }
+
+        IrregularCycles = true;
+    }
+
+    if (!Thumb && CurInstr.Cond() < 0xE)
+        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
+    else
+        ConstantCycles += cycles;
+}
+
 }
\ No newline at end of file
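(Aside, not part of the patch: the std::max(numC + numD - 3, std::max(numC, numD)) expression above merges code and data cycles that partly overlap on the bus, clamping the total so it never drops below the slower of the two accesses. Worked through with made-up timings numC = 4 and numD = 2: max(4 + 2 - 3, max(4, 2)) = max(3, 4) = 4 cycles, cheaper than the naive 4 + 2 = 6; with numC = numD = 2 it yields max(1, 2) = 2. The ARM9 path in Comp_AddCycles_CD applies the same shape with a larger overlap constant of 6.)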
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index fcb2380..792ff66 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -4,6 +4,7 @@
 #include "../dolphin/x64Emitter.h"

 #include "../ARMJIT.h"
+#include "../ARMJIT_Internal.h"
 #include "../ARMJIT_RegisterCache.h"

 namespace ARMJIT
@@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX;
 const Gen::X64Reg RSCRATCH2 = Gen::EDX;
 const Gen::X64Reg RSCRATCH3 = Gen::ECX;

+struct ComplexOperand
+{
+    ComplexOperand()
+    {}
+
+    ComplexOperand(u32 imm)
+        : IsImm(true), Imm(imm)
+    {}
+    ComplexOperand(int reg, int op, int amount)
+        : IsImm(false)
+    {
+        Reg.Reg = reg;
+        Reg.Op = op;
+        Reg.Amount = amount;
+    }
+
+    bool IsImm;
+    union
+    {
+        struct
+        {
+            int Reg, Op, Amount;
+        } Reg;
+        u32 Imm;
+    };
+};

 class Compiler : public Gen::XEmitter
 {
@@ -24,7 +51,7 @@ public:
     void Reset();

-    CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount);
+    JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount);

     void LoadReg(int reg, Gen::X64Reg nativeReg);
     void SaveReg(int reg, Gen::X64Reg nativeReg);
@@ -39,6 +66,8 @@ public:
     void Comp_AddCycles_C(bool forceNonConstant = false);
     void Comp_AddCycles_CI(u32 i);
     void Comp_AddCycles_CI(Gen::X64Reg i, int add);
+    void Comp_AddCycles_CDI();
+    void Comp_AddCycles_CD();

     enum
     {
@@ -92,8 +121,17 @@ public:
     void T_Comp_BL_LONG_2();
     void T_Comp_BL_Merged();

-    void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size);
+    enum
+    {
+        memop_Writeback = 1 << 0,
+        memop_Post = 1 << 1,
+        memop_SignExtend = 1 << 2,
+        memop_Store = 1 << 3,
+        memop_SubtractOffset = 1 << 4
+    };
+    void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags);
     s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode);
+    void Comp_MemLoadLiteral(int size, int rd, u32 addr);

     void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&,
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..10c3b1b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,9 +561,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -813,7 +815,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -837,7 +839,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -861,8 +863,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -886,8 +887,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 3cff0ed..63d61a3 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,7 @@ int GL_Antialias; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -50,6 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c13eae3..0fcefc3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -49,6 +49,7 @@ extern int GL_Antialias; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 1baa308..e9e6795 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -536,7 +536,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -757,7 +757,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -1870,10 +1870,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1924,10 +1920,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1994,10 +1986,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2292,7 +2280,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2355,7 +2343,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2428,7 +2416,7 @@ void ARM7Write16(u32 
addr, u16 val)

 void ARM7Write32(u32 addr, u32 val)
 {
 #ifdef JIT_ENABLED
-    ARMJIT::Invalidate32<1>(addr);
+    ARMJIT::InvalidateByAddr7(addr);
 #endif

     switch (addr & 0xFF800000)
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
index 09ea8eb..45e8e0c 100644
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ b/src/libui_sdl/DlgEmuSettings.cpp
@@ -42,6 +42,7 @@ uiCheckbox* cbDirectBoot;
 #ifdef JIT_ENABLED
 uiCheckbox* cbJITEnabled;
 uiEntry* enJITMaxBlockSize;
+uiCheckbox* cbJITBranchOptimisations;
 #endif

 int OnCloseWindow(uiWindow* window, void* blarg)
@@ -64,13 +65,15 @@ void OnOk(uiButton* btn, void* blarg)
     bool enableJit = uiCheckboxChecked(cbJITEnabled);
     char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize);
     long blockSize = strtol(maxBlockSizeStr, NULL, 10);
+    bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations);
     uiFreeText(maxBlockSizeStr);

     if (blockSize < 1)
         blockSize = 1;
     if (blockSize > 32)
         blockSize = 32;

-    if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize)
+    if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize ||
+        branchOptimisations != Config::JIT_BrancheOptimisations)
     {
         if (RunningSomething &&
             !uiMsgBoxConfirm(win, "Reset emulator",
@@ -79,6 +82,7 @@ void OnOk(uiButton* btn, void* blarg)
         Config::JIT_Enable = enableJit;
         Config::JIT_MaxBlockSize = blockSize;
+        Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations);

         restart = true;
     }
@@ -101,9 +105,15 @@ void OnOk(uiButton* btn, void* blarg)
 void OnJITStateChanged(uiCheckbox* cb, void* blarg)
 {
     if (uiCheckboxChecked(cb))
+    {
         uiControlEnable(uiControl(enJITMaxBlockSize));
+        uiControlEnable(uiControl(cbJITBranchOptimisations));
+    }
     else
+    {
         uiControlDisable(uiControl(enJITMaxBlockSize));
+        uiControlDisable(uiControl(cbJITBranchOptimisations));
+    }
 }
 #endif

@@ -159,6 +169,14 @@ void Open()
             enJITMaxBlockSize = uiNewEntry();
             uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0);
         }
+
+        {
+            uiBox* row = uiNewHorizontalBox();
+            uiBoxAppend(in_ctrl, uiControl(row), 0);
+
+            cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks games in rare cases!)");
+            uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0);
+        }
     }
 #endif

@@ -194,6 +212,8 @@ void Open()
         uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr);
     }
     OnJITStateChanged(cbJITEnabled, NULL);
+
+    uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations);
 #endif

     uiControlShow(uiControl(win));
-- cgit v1.2.3


From 81f38c14be0d9ba5a3da8f67d9719ed2c47279c5 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Fri, 18 Oct 2019 13:29:17 +0200
Subject: integrate changes from ARM64 backend and more

- better handle LDM/STM in reg alloc
- unify Halted and IRQ in anticipation for branch inlining
- literal optimisations can be disabled in gui
- jit blocks follow simple returns
- fix idle loop detection
- break jit blocks on IRQ (fixes saving in Pokemon White)
---
 src/ARM.cpp                         | 40 ++++++++++++++++++-----------
 src/ARM.h                           | 13 +++++++---
 src/ARMJIT.cpp                      | 50 +++++++++++++++++++++++++++++++------
 src/ARMJIT_RegisterCache.h          | 33 +++++++++++++++++++-----
 src/ARMJIT_x64/ARMJIT_Compiler.cpp  |  7 +++---
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++----
 src/ARM_InstrInfo.cpp               | 28 +++++++++++++++++++++
 src/ARM_InstrInfo.h                 |  2 +-
 src/Config.cpp                      |  2 ++
 src/Config.h                        |  1 +
 src/NDS.cpp                         |  2 +-
 src/libui_sdl/DlgEmuSettings.cpp    | 31 ++++++++++++++++++---
 src/libui_sdl/main.cpp              |  2 --
 13 files changed, 179 insertions(+), 48 deletions(-)

(limited to 'src/Config.h')
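(Aside, not part of the patches: the "unify Halted and IRQ" item above is the union added to src/ARM.h below. Overlaying the three per-CPU stop flags with one u32 lets compiled blocks and the dispatch loop test all of them with a single compare; a minimal sketch using the patch's field names, with the surrounding loop condensed:

    union
    {
        struct
        {
            u8 Halted;
            u8 IRQ;      // nonzero to trigger IRQ
            u8 IdleLoop; // set when a compiled block detects an idle loop
        };
        u32 StopExecution; // nonzero iff any flag above is set
    };

    // dispatch loop: one load instead of three separate flag checks
    if (cpu->StopExecution)
    {
        if (cpu->IRQ)
            cpu->TriggerIRQ();
        // ... handle Halted / IdleLoop, see src/ARM.cpp below
    }

The fourth byte of the u32 is simply left unused.)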
diff --git a/src/ARM.cpp b/src/ARM.cpp
index 423c940..4fab60e 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -113,7 +113,7 @@ void ARM::DoSavestate(Savestate* file)
     file->Var32((u32*)&Cycles);
     //file->Var32((u32*)&CyclesToRun);
-    file->Var32(&Halted);
+    file->Var32(&StopExecution);

     file->VarArray(R, 16*sizeof(u32));
     file->Var32(&CPSR);
@@ -589,16 +589,21 @@ void ARMv5::ExecuteJIT()
         NDS::ARM9Timestamp += Cycles;
         Cycles = 0;

-        if (IRQ) TriggerIRQ();
-        if (Halted)
+        if (StopExecution)
         {
-            bool idleLoop = Halted & 0x20;
-            Halted &= ~0x20;
-            if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
+            if (IRQ)
+                TriggerIRQ();
+
+            if (Halted || IdleLoop)
             {
-                NDS::ARM9Timestamp = NDS::ARM9Target;
+                bool idleLoop = IdleLoop;
+                IdleLoop = 0;
+                if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
+                {
+                    NDS::ARM9Timestamp = NDS::ARM9Target;
+                }
+                break;
             }
-            break;
         }
     }

@@ -726,16 +731,21 @@ void ARMv4::ExecuteJIT()
         Cycles = 0;

         // TODO optimize this shit!!!
-        if (IRQ) TriggerIRQ();
-        if (Halted)
+        if (StopExecution)
         {
-            bool idleLoop = Halted & 0x20;
-            Halted &= ~0x20;
-            if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
+            if (IRQ)
+                TriggerIRQ();
+
+            if (Halted || IdleLoop)
             {
-                NDS::ARM7Timestamp = NDS::ARM7Target;
+                bool idleLoop = IdleLoop;
+                IdleLoop = 0;
+                if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
+                {
+                    NDS::ARM7Timestamp = NDS::ARM7Target;
+                }
+                break;
             }
-            break;
         }
     }

diff --git a/src/ARM.h b/src/ARM.h
index 8a01068..e252d23 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -112,9 +112,16 @@ public:
     u32 Num;

     s32 Cycles;
-    u32 Halted;
-
-    u32 IRQ; // nonzero to trigger IRQ
+    union
+    {
+        struct
+        {
+            u8 Halted;
+            u8 IRQ; // nonzero to trigger IRQ
+            u8 IdleLoop;
+        };
+        u32 StopExecution;
+    };

     u32 CodeRegion;
     s32 CodeCycles;
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 19a5e70..0695b85 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -16,11 +16,13 @@
 #include "GPU3D.h"
 #include "SPU.h"
 #include "Wifi.h"
+#include "NDSCart.h"

 namespace ARMJIT
 {

 #define JIT_DEBUGPRINT(msg, ...)
+//#define JIT_DEBUGPRINT(msg, ...)
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = 
CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1);
@@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf()
 
 void Compiler::T_Comp_LoadPCRel()
 {
-    u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);
-
-    Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
+    u32 offset = (CurInstr.Instr & 0xFF) << 2;
+    u32 addr = (R15 & ~0x2) + offset;
+    if (Config::JIT_LiteralOptimisations)
+        Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
+    else
+        Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0);
 }
 
 void Compiler::T_Comp_MemSPRel()
diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp
index 1261bbe..8f8bd35 100644
--- a/src/ARM_InstrInfo.cpp
+++ b/src/ARM_InstrInfo.cpp
@@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr)
         if (res.Kind == ARMInstrInfo::tk_LDR_PCREL)
             res.SpecialKind = special_LoadLiteral;
 
+        if (res.Kind == tk_LDMIA || res.Kind == tk_POP)
+        {
+            u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs);
+            res.NotStrictlyNeeded |= set;
+            res.DstRegs |= set;
+        }
+        if (res.Kind == tk_STMIA || res.Kind == tk_PUSH)
+        {
+            u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs);
+            if (res.Kind == tk_PUSH && instr & (1 << 8))
+                set |= (1 << 14);
+            res.NotStrictlyNeeded |= set;
+            res.SrcRegs |= set;
+        }
+
         res.EndBlock |= res.Branches();
 
         if (res.Kind == tk_BCOND)
@@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr)
 
         if ((data & A_LoadMem) && res.SrcRegs == (1 << 15))
             res.SpecialKind = special_LoadLiteral;
+
+        if (res.Kind == ak_LDM)
+        {
+            u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15));
+            res.DstRegs |= set;
+            res.NotStrictlyNeeded |= set;
+        }
+        if (res.Kind == ak_STM)
+        {
+            u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15));
+            res.SrcRegs |= set;
+            res.NotStrictlyNeeded |= set;
+        }
 
         if ((instr >> 28) < 0xE)
         {
diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h
index c032a4f..2732181 100644
--- a/src/ARM_InstrInfo.h
+++ b/src/ARM_InstrInfo.h
@@ -236,7 +236,7 @@ enum
 
 struct Info
 {
-    u16 DstRegs, SrcRegs;
+    u16 DstRegs, SrcRegs, NotStrictlyNeeded;
     u16 Kind;
 
     u8 SpecialKind;
diff --git a/src/Config.cpp b/src/Config.cpp
index 63d61a3..eb5bfcc 100644
--- a/src/Config.cpp
+++ b/src/Config.cpp
@@ -38,6 +38,7 @@ int GL_Antialias;
 bool JIT_Enable = false;
 int JIT_MaxBlockSize = 12;
 bool JIT_BrancheOptimisations = true;
+bool JIT_LiteralOptimisations = true;
 #endif
 
 ConfigEntry ConfigFile[] =
@@ -52,6 +53,7 @@ ConfigEntry ConfigFile[] =
     {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0},
     {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0},
     {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0},
+    {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0},
 #endif
 
     {"", -1, NULL, 0, NULL, 0}
diff --git a/src/Config.h b/src/Config.h
index 0fcefc3..723ab13 100644
--- a/src/Config.h
+++ b/src/Config.h
@@ -50,6 +50,7 @@ extern int GL_Antialias;
 extern bool JIT_Enable;
 extern int JIT_MaxBlockSize;
 extern bool JIT_BrancheOptimisations;
+extern bool JIT_LiteralOptimisations;
 #endif
 
 }
diff --git a/src/NDS.cpp b/src/NDS.cpp
index e9e6795..141c565 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -1142,7 +1142,7 @@ void UpdateIRQ(u32 cpu)
 
     if (IME[cpu] & 0x1)
     {
-        arm->IRQ = IE[cpu] & IF[cpu];
+        arm->IRQ = !!(IE[cpu] & IF[cpu]);
     }
     else
     {
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
index 45e8e0c..0df9c6c 100644
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ b/src/libui_sdl/DlgEmuSettings.cpp
@@ -43,6 +43,7 @@ uiCheckbox* cbDirectBoot;
 uiCheckbox* cbJITEnabled;
 uiEntry* enJITMaxBlockSize;
 uiCheckbox* cbJITBranchOptimisations;
+uiCheckbox*
cbJITLiteralOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -66,14 +67,16 @@ void OnOk(uiButton* btn, void* blarg) char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || - branchOptimisations != Config::JIT_BrancheOptimisations) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize + || branchOptimisations != Config::JIT_BrancheOptimisations + || literalOptimisations != Config::JIT_LiteralOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -82,7 +85,8 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + Config::JIT_BrancheOptimisations = branchOptimisations; + Config::JIT_LiteralOptimisations = literalOptimisations; restart = true; } @@ -108,11 +112,13 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) { uiControlEnable(uiControl(enJITMaxBlockSize)); uiControlEnable(uiControl(cbJITBranchOptimisations)); + uiControlEnable(uiControl(cbJITLiteralOptimisations)); } else { uiControlDisable(uiControl(enJITMaxBlockSize)); uiControlDisable(uiControl(cbJITBranchOptimisations)); + uiControlDisable(uiControl(cbJITLiteralOptimisations)); } } #endif @@ -174,9 +180,25 @@ void Open() uiBox* row = uiNewHorizontalBox(); uiBoxAppend(in_ctrl, uiControl(row), 0); - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); + uiBoxAppend(row, uiControl(lbl), 0); + } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); + uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); + } } #endif @@ -214,6 +236,7 @@ void Open() OnJITStateChanged(cbJITEnabled, NULL); uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); + uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); #endif uiControlShow(uiControl(win)); diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 3787bab1f69ae22d3e8106d70598ce923e5efe70 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- 
src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 7 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4871 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src/Config.h') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..32cb91c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -206,15 +206,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -227,9 +227,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -272,7 +272,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -285,7 +285,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -544,7 +544,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -584,14 +584,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -685,7 +687,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -725,14 +727,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else 
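+        // no compiled block at this address yet: compile one now; the next
+        // iteration of the loop finds it through LookUpBlockEntry and dispatches it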
        ARMJIT::CompileBlock(this);
 
-    NDS::ARM7Timestamp += Cycles;
-    Cycles = 0;
+    NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1);
 
     // TODO optimize this shit!!!
     if (StopExecution)
diff --git a/src/ARM.h b/src/ARM.h
index 7767095..4877956 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -185,14 +185,14 @@ public:
     {
         // code only. always nonseq 32-bit for ARM9.
         s32 numC = (R[15] & 0x2) ? 0 : CodeCycles;
-        Cycles += numC;
+        Cycles -= numC;
     }
 
     void AddCycles_CI(s32 numI)
     {
         // code+internal
         s32 numC = (R[15] & 0x2) ? 0 : CodeCycles;
-        Cycles += numC + numI;
+        Cycles -= numC + numI;
     }
 
     void AddCycles_CDI()
@@ -203,9 +203,9 @@ public:
         s32 numD = DataCycles;
 
         //if (DataRegion != CodeRegion)
-            Cycles += std::max(numC + numD - 6, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 6, std::max(numC, numD));
         //else
-        //    Cycles += numC + numD;
+        //    Cycles -= numC + numD;
     }
 
     void AddCycles_CD()
@@ -215,9 +215,9 @@ public:
         s32 numD = DataCycles;
 
         //if (DataRegion != CodeRegion)
-            Cycles += std::max(numC + numD - 6, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 6, std::max(numC, numD));
         //else
-        //    Cycles += numC + numD;
+        //    Cycles -= numC + numD;
     }
 
     void GetCodeMemRegion(u32 addr, NDS::MemRegion* region);
@@ -375,13 +375,13 @@ public:
     void AddCycles_C()
     {
         // code only. this code fetch is sequential.
-        Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3];
+        Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3];
     }
 
     void AddCycles_CI(s32 num)
     {
         // code+internal. results in a nonseq code fetch.
-        Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
+        Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
     }
 
     void AddCycles_CDI()
@@ -393,21 +393,21 @@ public:
         if ((DataRegion >> 4) == 0x02) // mainRAM
         {
             if (CodeRegion == 0x02)
-                Cycles += numC + numD;
+                Cycles -= numC + numD;
             else
             {
                 numC++;
-                Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+                Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
             }
         }
         else if (CodeRegion == 0x02)
         {
             numD++;
-            Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else
         {
-            Cycles += numC + numD + 1;
+            Cycles -= numC + numD + 1;
         }
     }
 
@@ -420,17 +420,17 @@ public:
         if ((DataRegion >> 4) == 0x02)
         {
             if (CodeRegion == 0x02)
-                Cycles += numC + numD;
+                Cycles -= numC + numD;
             else
-                Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+                Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else if (CodeRegion == 0x02)
         {
-            Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else
         {
-            Cycles += numC + numD;
+            Cycles -= numC + numD;
         }
     }
 };
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 208801e..cc8d4ce 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -2,6 +2,10 @@
 
 #include <string.h>
 #include <assert.h>
+#include <unordered_map>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
 
 #include "Config.h"
 
@@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = {
 u32 AddrTranslate9[0x2000];
 u32 AddrTranslate7[0x4000];
 
-JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 AddressRange CodeRanges[ExeMemSpaceSize / 512];
 
-TinyVector<JitBlock*> JitBlocks;
-JitBlock* RestoreCandidates[0x1000] = {NULL};
+std::unordered_map<u32, JitBlock*> JitBlocks;
 
-u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
+template <typename K, typename V, int Size, V InvalidValue>
+struct UnreliableHashTable
 {
-    return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
-}
+    struct Bucket
+    {
+        K KeyA, KeyB;
+        V ValA, ValB;
+    };
+
+    Bucket Table[Size];
+
+    void Reset()
+    {
+        for (int i = 0; i < Size; i++)
+        {
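+            // mark both ways of this bucket as empty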
+ Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable RestoreCandidates; +UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; } + for (int i = 0; i < block->NumLinks(); i++) + compiler->UnlinkBlock(block->Links()[i]); + block->ResetLinks(); - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } - JitBlocks.Clear(); + JitBlocks.clear(); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - - memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); - for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + + FastBlockLookUp.Reset(); + RestoreCandidates.Reset(); + for (int i = 0; i 
< sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++)
     {
-        if (RestoreCandidates[i])
+        if (RestoreCandidates.Table[i].ValA)
         {
-            delete RestoreCandidates[i];
-            RestoreCandidates[i] = NULL;
+            delete RestoreCandidates.Table[i].ValA;
+            RestoreCandidates.Table[i].ValA = NULL;
+        }
+        if (RestoreCandidates.Table[i].ValB)
+        {
+            delete RestoreCandidates.Table[i].ValB;
+            RestoreCandidates.Table[i].ValB = NULL;
         }
     }
-    for (int i = 0; i < JitBlocks.Length; i++)
+    for (auto it : JitBlocks)
     {
-        JitBlock* block = JitBlocks[i];
+        JitBlock* block = it.second;
         for (int j = 0; j < block->NumAddresses; j++)
         {
             u32 addr = block->AddressRanges()[j];
@@ -788,11 +882,43 @@ void ResetBlockCache()
         }
         delete block;
     }
-    JitBlocks.Clear();
+    JitBlocks.clear();
 
     compiler->Reset();
 }
 
+JitBlockEntry LookUpBlockEntry(u32 addr)
+{
+    u32 entryOffset = FastBlockLookUp.LookUp(addr);
+    if (entryOffset != UINT32_MAX)
+        return compiler->AddEntryOffset(entryOffset);
+
+    auto block = JitBlocks.find(addr);
+    if (block != JitBlocks.end())
+    {
+        FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint));
+        return block->second->EntryPoint;
+    }
+    return NULL;
+}
+
+template <u32 num>
+void LinkBlock(ARM* cpu, u32 codeOffset)
+{
+    u32 targetPseudoPhys = TranslateAddr<num>(cpu->R[15] - ((cpu->CPSR&0x20)?2:4));
+    auto block = JitBlocks.find(targetPseudoPhys);
+    if (block == JitBlocks.end())
+    {
+        CompileBlock(cpu);
+        block = JitBlocks.find(targetPseudoPhys);
+    }
+
+    JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys);
+
+    block->second->AddLink(codeOffset);
+    compiler->LinkBlock(codeOffset, block->second->EntryPoint);
+}
+
 void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 {
     if (cpu->Num == 0)
@@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 
     return NULL;
 }
-}
\ No newline at end of file
+}
+
+template void ARMJIT::LinkBlock<0>(ARM*, u32);
+template void ARMJIT::LinkBlock<1>(ARM*, u32);
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 09cc463..cab385f 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -32,7 +32,6 @@
 extern u32 AddrTranslate9[0x2000];
 extern u32 AddrTranslate7[0x4000];
 
 const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
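+// Compiled blocks are now found in two stages: FastBlockLookUp, a small 2-way
+// associative cache keyed on the pseudo-physical address which may evict
+// entries at any time (hence "unreliable"), backed by the authoritative
+// JitBlocks map on a miss. This replaces the flat FastBlockAccess table below,
+// which kept one entry per halfword of executable memory
+// (0x518000 / 2 pointers, roughly 20 MB on 64-bit hosts).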
-extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) @@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr) return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } -template -inline JitBlockEntry LookUpBlock(u32 addr) -{ - return FastBlockAccess[TranslateAddr(addr) / 2]; -} +JitBlockEntry LookUpBlockEntry(u32 addr); + void Init(); void DeInit(); @@ -73,4 +69,6 @@ void ResetBlockCache(); } +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + #endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 0d6add9..66d1808 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -15,7 +15,8 @@ enum { branch_IdleBranch = 1 << 0, branch_FollowCondTaken = 1 << 1, - branch_FollowCondNotTaken = 1 << 2 + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, }; struct FetchedInstr @@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector assert(capacity > Capacity); T* newMem = new T[capacity]; if (Data != NULL) - memcpy(newMem, Data, sizeof(Data) * Length); + memcpy(newMem, Data, sizeof(T) * Length); T* oldData = Data; Data = newMem; @@ -163,7 +164,6 @@ public: u32 NumInstrs; u32 NumAddresses; - u32 NumLinks; JitBlockEntry EntryPoint; @@ -171,6 +171,21 @@ public: { return &Data[0]; } u32* AddressRanges() { return &Data[NumInstrs]; } + u32* Links() + { return &Data[NumInstrs + NumAddresses]; } + + u32 NumLinks() + { return Data.Length - NumInstrs - NumAddresses; } + + void AddLink(u32 link) + { + Data.Add(link); + } + + void ResetLinks() + { + Data.SetLength(NumInstrs + NumAddresses); + } private: /* @@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000]; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); +template +void LinkBlock(ARM* cpu, u32 codeOffset); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index e02865d..cac590a 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) @@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) IrregularCycles = true; BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - bool previouslyDirty = CPSRDirty; + bool cpsrDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) @@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) LoadReg(reg, RegCache.Mapping[reg]); } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } void Compiler::A_Comp_BranchImm() @@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + 
Comp_SpecialBranchBehaviour(false); Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
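+        // fewer than 32 KiB of code memory left; the buffer is a plain bump
+        // allocator with no per-block freeing, so flush the whole cache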
ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx 
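+/* 64-bit views of the same four argument registers */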
+#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 912299d..f650f42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,9 +30,13 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp + + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -49,7 +53,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index be6a833..f3f8c6c 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -35,10 +35,10 @@ int GL_ScaleFactor; int GL_Antialias; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,7 +52,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 723ab13..fff476a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -47,10 +47,10 @@ extern int GL_ScaleFactor; extern int GL_Antialias; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include +# undef inline +# endif +#elif defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. 
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include +# else +# include +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. 
*/ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+ * {
+ * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+ * }
+ */
+#if defined(_MSC_VER) && defined(_M_IX86)
+# include <intrin.h>
+# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*
+ * Calculates a 64->128-bit long multiply.
+ *
+ * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+ /*
+ * GCC/Clang __uint128_t method.
+ *
+ * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+ * This is usually the best way as it usually uses a native long 64-bit
+ * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+ *
+ * Usually.
+ *
+ * On 32-bit platforms, Clang (and emscripten) still define this type
+ * despite not having the arithmetic for it. This results in a laggy
+ * compiler builtin call which calculates a full 128-bit multiply.
+ * In that case it is best to use the portable one.
+ * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+ */
+#if defined(__GNUC__) && !defined(__wasm__) \
+ && defined(__SIZEOF_INT128__) \
+ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+ __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+ XXH128_hash_t r128;
+ r128.low64 = (xxh_u64)(product);
+ r128.high64 = (xxh_u64)(product >> 64);
+ return r128;
+
+ /*
+ * MSVC for x64's _umul128 method.
+ *
+ * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+ *
+ * This compiles to single operand MUL on x64.
+ */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+# pragma intrinsic(_umul128)
+#endif
+ xxh_u64 product_high;
+ xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+ XXH128_hash_t r128;
+ r128.low64 = product_low;
+ r128.high64 = product_high;
+ return r128;
+
+#else
+ /*
+ * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+ *
+ * This is a fast and simple grade school multiply, which is shown below
+ * with base 10 arithmetic instead of base 0x100000000.
+ *
+ * 9 3 // D2 lhs = 93
+ * x 7 5 // D2 rhs = 75
+ * ----------
+ * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+ * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+ * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+ * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+ * ---------
+ * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+ * + 6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69
+ * ---------
+ * 6 9 7 5 // D4 res = ((27 % 10) * 10) + (15 % 10) + (69 * 100) = 6975
+ *
+ * The reasons for adding the products like this are:
+ * 1. It avoids manual carry tracking. Just like how
+ * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+ * This avoids a lot of complexity.
+ *
+ * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
+ * instruction available in ARM's Digital Signal Processing extension
+ * in 32-bit ARMv6 and later, which is shown below:
+ *
+ * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+ * {
+ * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+ * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+ * *RdHi = (xxh_u32)(product >> 32);
+ * }
+ *
+ * This instruction was designed for efficient long multiplication, and
+ * allows this to be calculated in only 4 instructions at speeds
+ * comparable to some 64-bit ALUs.
+ *
+ * 3. It isn't terrible on other platforms. Usually this will be a couple
+ * of 32-bit ADD/ADCs.
+ */
+
+ /* First calculate all of the cross products. */
+ xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+ xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
+ xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+ xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
+
+ /* Now add the products together. These will never overflow. */
+ xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+ xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
+ xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+ XXH128_hash_t r128;
+ r128.low64 = lower;
+ r128.high64 = upper;
+ return r128;
+#endif
+}
+
+/*
+ * Does a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+ XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+ return product.low64 ^ product.high64;
+}
+
+/* Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+ XXH_ASSERT(0 <= shift && shift < 64);
+ return v64 ^ (v64 >> shift);
+}
+
+/*
+ * We don't need to (or want to) mix as much as XXH64.
+ *
+ * Short hashes are more evenly distributed, so it isn't necessary.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+ h64 = XXH_xorshift64(h64, 37);
+ h64 *= 0x165667919E3779F9ULL;
+ h64 = XXH_xorshift64(h64, 32);
+ return h64;
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. They used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
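+ *
+ * For example, XXH64's tail is branchy; roughly (with hypothetical
+ * mix8/mix4/mix1 helpers standing in for the real rounds):
+ *
+ * while (p + 8 <= end) { h = mix8(h, XXH_readLE64(p)); p += 8; }
+ * if (p + 4 <= end) { h = mix4(h, XXH_readLE32(p)); p += 4; }
+ * while (p < end) { h = mix1(h, *p++); }
+ *
+ * so a 17-byte input pays for one more round than a 16-byte one, and the
+ * branch pattern changes with len % 8.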
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(1 <= len && len <= 3);
+ XXH_ASSERT(secret != NULL);
+ /*
+ * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+ * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+ * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+ */
+ { xxh_u8 const c1 = input[0];
+ xxh_u8 const c2 = input[len >> 1];
+ xxh_u8 const c3 = input[len - 1];
+ xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)
+ | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+ xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+ xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+ xxh_u64 const mixed = keyed * PRIME64_1;
+ return XXH3_avalanche(mixed);
+ }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(secret != NULL);
+ XXH_ASSERT(4 <= len && len <= 8);
+ seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+ { xxh_u32 const input1 = XXH_readLE32(input);
+ xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+ xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+ xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+ xxh_u64 x = input64 ^ bitflip;
+ /* this mix is inspired by Pelle Evensen's rrmxmx */
+ x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24);
+ x *= 0x9FB21C651E98DF25ULL;
+ x ^= (x >> 35) + len;
+ x *= 0x9FB21C651E98DF25ULL;
+ return XXH_xorshift64(x, 28);
+ }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(secret != NULL);
+ XXH_ASSERT(8 <= len && len <= 16);
+ { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+ xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+ xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
+ xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+ xxh_u64 const acc = len
+ + XXH_swap64(input_lo) + input_hi
+ + XXH3_mul128_fold64(input_lo, input_hi);
+ return XXH3_avalanche(acc);
+ }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(len <= 16);
+ { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+ if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+ if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+ return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+ }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+ const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
+ /*
+ * UGLY HACK:
+ * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+ * slower code.
+ *
+ * By forcing seed64 into a register, we disrupt the cost model and
+ * cause it to scalarize. See `XXH32_round()`
+ *
+ * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+ * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+ * GCC 9.2, despite both emitting scalar code.
+ *
+ * GCC generates much better scalar code than Clang for the rest of XXH3,
+ * which is why finding a more optimal codepath is of interest.
+ */
+ __asm__ ("" : "+r" (seed64));
+#endif
+ { xxh_u64 const input_lo = XXH_readLE64(input);
+ xxh_u64 const input_hi = XXH_readLE64(input+8);
+ return XXH3_mul128_fold64(
+ input_lo ^ (XXH_readLE64(secret) + seed64),
+ input_hi ^ (XXH_readLE64(secret+8) - seed64)
+ );
+ }
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
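+/*
+ * Sketch of the "mum" step as used here: each 16-byte chunk goes through
+ * XXH3_mix16B above, which keys two 64-bit lanes with the secret, takes the
+ * full 128-bit product, and XOR-folds it:
+ *
+ *   acc += fold64( (in_lo ^ (sec_lo + seed)) * (in_hi ^ (sec_hi - seed)) )
+ *
+ * where fold64(p) == (xxh_u64)p ^ (xxh_u64)(p >> 64) (XXH3_mul128_fold64).
+ */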
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(16 < len && len <= 128);
+
+ { xxh_u64 acc = len * PRIME64_1;
+ if (len > 32) {
+ if (len > 64) {
+ if (len > 96) {
+ acc += XXH3_mix16B(input+48, secret+96, seed);
+ acc += XXH3_mix16B(input+len-64, secret+112, seed);
+ }
+ acc += XXH3_mix16B(input+32, secret+64, seed);
+ acc += XXH3_mix16B(input+len-48, secret+80, seed);
+ }
+ acc += XXH3_mix16B(input+16, secret+32, seed);
+ acc += XXH3_mix16B(input+len-32, secret+48, seed);
+ }
+ acc += XXH3_mix16B(input+0, secret+0, seed);
+ acc += XXH3_mix16B(input+len-16, secret+16, seed);
+
+ return XXH3_avalanche(acc);
+ }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+ #define XXH3_MIDSIZE_STARTOFFSET 3
+ #define XXH3_MIDSIZE_LASTOFFSET 17
+
+ { xxh_u64 acc = len * PRIME64_1;
+ int const nbRounds = (int)len / 16;
+ int i;
+ for (i=0; i<8; i++) {
+ acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+ }
+ acc = XXH3_avalanche(acc);
+ XXH_ASSERT(nbRounds >= 8);
+#if defined(__clang__) /* Clang */ \
+ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
+ /*
+ * UGLY HACK:
+ * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+ * Everywhere else, it uses scalar code.
+ *
+ * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+ * would still be slower than UMAAL (see XXH_mult64to128).
+ *
+ * Unfortunately, Clang doesn't handle the long multiplies properly and
+ * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+ * scalarized into an ugly mess of VMOV.32 instructions.
+ *
+ * This mess is difficult to avoid without turning autovectorization
+ * off completely, but the issues are usually relatively minor and/or not
+ * worth fixing.
+ *
+ * This loop is the easiest to fix, as unlike XXH32, this pragma
+ * _actually works_ because it is a loop vectorization instead of an
+ * SLP vectorization.
+ */
+ #pragma clang loop vectorize(disable)
+#endif
+ for (i=8 ; i < nbRounds; i++) {
+ acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+ }
+ /* last bytes */
+ acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+ return XXH3_avalanche(acc);
+ }
+}
+
+
+/* === Long Keys === */
+
+#define STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
+
+typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
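+ *
+ * Per 64-bit accumulator lane, one stripe is, in scalar terms (mirroring the
+ * scalar variant at the bottom of this function):
+ *
+ *   data_key = data ^ secret;
+ *   acc += (data_key & 0xFFFFFFFF) * (data_key >> 32); /* UMAC-style mum */
+ *   acc += data; /* or swap(data) in 128-bit mode */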
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
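+ *
+ * In other words, the code below deliberately uses the widening form:
+ *
+ *   uint64x2_t prod_hi = vmull_u32(data_key_hi, prime); /* 32x32 -> 64 */
+ *   xacc[i] = vshlq_n_u64(prod_hi, 32); /* then shift */
+ *
+ * rather than vmul_u32 followed by vshll_n_u32.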
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
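+ *
+ * Note that each stripe advances the input by STRIPE_LEN (64) bytes but the
+ * secret by only XXH_SECRET_CONSUME_RATE (8) bytes, so stripe n is keyed
+ * with secret[n*8 .. n*8 + 63]; the overlapping secret windows are
+ * intentional.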
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
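+ *
+ * Layout sketch (offsets vary per call):
+ *
+ *   base (from XXH_malloc)       ptr (returned, aligned)
+ *   v                            v
+ *   [ ... padding ... | offset ][ s usable bytes ... ]
+ *
+ * ptr[-1] stores the distance back to base so that XXH_alignedFree() can
+ * recover the original pointer.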
+ * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); +} + +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_64bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->acc[0] = PRIME32_3; + statePtr->acc[1] = PRIME64_1; + statePtr->acc[2] = PRIME64_2; + statePtr->acc[3] = PRIME64_3; + statePtr->acc[4] = PRIME64_4; + statePtr->acc[5] = PRIME32_2; + statePtr->acc[6] = PRIME64_5; + statePtr->acc[7] = PRIME32_1; + statePtr->seed = seed; + XXH_ASSERT(secret != NULL); + statePtr->secret = secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if 
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
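+ *
+ * For example, assuming the default XXH3_INTERNALBUFFER_SIZE of 256 bytes:
+ * with bufferedSize == 40 and len == 400, the code below copies 216 bytes
+ * to top the buffer up, consumes the full buffer as
+ * XXH3_INTERNALBUFFER_STRIPES stripes, and leaves the remaining 184 bytes
+ * to the paths that follow.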
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
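+ *
+ * Concretely, since len <= 16, (len - 1) << 54 occupies at most bits 54-57:
+ * near the top of m128.low64, i.e. the middle of the 128-bit value, where
+ * the 128x64 multiply by PRIME64_2 spreads it into both h128.low64 and
+ * h128.high64.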
+ */
+ m128.low64 += (xxh_u64)(len - 1) << 54;
+ input_hi ^= bitfliph;
+ /*
+ * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+ * add the long product of the low 32 bits of input_hi and PRIME32_2 to
+ * the high 64 bits of m128.
+ *
+ * The best approach to this operation is different on 32-bit and 64-bit.
+ */
+ if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+ /*
+ * 32-bit optimized version, which is more readable.
+ *
+ * On 32-bit, it removes an ADC and delays a dependency between the two
+ * halves of m128.high64, but it generates an extra mask on 64-bit.
+ */
+ m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2);
+ } else {
+ /*
+ * 64-bit optimized (albeit more confusing) version.
+ *
+ * Uses some properties of addition and multiplication to remove the mask:
+ *
+ * Let:
+ * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+ * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+ * c = PRIME32_2
+ *
+ * b + (a * c)
+ * Inverse Property: x + y - x == y
+ * b + (a * (1 + c - 1))
+ * Distributive Property: x * (y + z) == (x * y) + (x * z)
+ * b + (a * 1) + (a * (c - 1))
+ * Identity Property: x * 1 == x
+ * b + a + (a * (c - 1))
+ *
+ * Substitute a, b, and c:
+ * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
+ *
+ * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+ * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
+ */
+ m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1);
+ }
+ /* m128 ^= XXH_swap64(m128 >> 64); */
+ m128.low64 ^= XXH_swap64(m128.high64);
+
+ { /* 128x64 multiply: h128 = m128 * PRIME64_2; */
+ XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
+ h128.high64 += m128.high64 * PRIME64_2;
+
+ h128.low64 = XXH3_avalanche(h128.low64);
+ h128.high64 = XXH3_avalanche(h128.high64);
+ return h128;
+ } }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(len <= 16);
+ { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+ if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+ if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+ { XXH128_hash_t h128;
+ xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+ xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+ h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl);
+ h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph);
+ return h128;
+ } }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
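+ *
+ * Even when one of the XXH3_mix16B products cancels to zero, the cross XORs
+ * below (acc.low64 ^= words of input_2, acc.high64 ^= words of input_1)
+ * keep the raw input words in the accumulator.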
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+
+/*
+ * All the functions are actually the same as for the 64-bit streaming variant.
+ * The only difference is the finalization routine.
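+ *
+ * A minimal usage sketch, for illustration only (`read_chunk` stands in for
+ * any application-defined input source, and printing assumes <stdio.h>):
+ *
+ *     XXH3_state_t* const state = XXH3_createState();
+ *     if (state != NULL
+ *         && XXH3_128bits_reset_withSeed(state, 0x1234) == XXH_OK) {
+ *         char buf[4096];
+ *         size_t n;
+ *         while ((n = read_chunk(buf, sizeof(buf))) != 0)
+ *             XXH3_128bits_update(state, buf, n);
+ *         {   XXH128_hash_t const h = XXH3_128bits_digest(state);
+ *             printf("%016llx%016llx\n",
+ *                    (unsigned long long)h.high64,
+ *                    (unsigned long long)h.low64);
+ *     }   }
+ *     XXH3_freeState(state);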
+ */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         state->secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         state->secret + state->secretLimit + STRIPE_LEN
+                                                       - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#define XXH_IMPLEMENTATION   /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h
new file mode 100644
index 0000000..67a5887
--- /dev/null
+++ b/src/xxhash/xxhash.h
@@ -0,0 +1,1965 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. 
+ * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
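+ *
+ * For example (a sketch; `MYLIB_` is an arbitrary prefix):
+ *     #define XXH_NAMESPACE MYLIB_
+ *     #include "xxhash.h"
+ *     // Client code keeps calling XXH32(buf, len, 0); the linker-visible
+ *     // symbol becomes MYLIB_XXH32, avoiding collisions.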
+ */
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    7
+#define XXH_VERSION_RELEASE  4
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXH32_hash_t;
+#     else
+#       error "unsupported platform: need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+/*!
+ * XXH32():
+ *  Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+ *  The memory between input & input+length must be valid (allocated and read-accessible).
+ *  "seed" can be used to alter the result predictably.
+ *  Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*******   Streaming   *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ */
+
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what the order is on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+   /* the following type must have a width of 64-bit */
+   typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*!
+ * XXH64():
+ * Returns the 64-bit hash of sequence of length @length stored at memory
+ * address @input.
+ * @seed can be used to alter the result predictably.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
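+ *
+ * Example (sketch):
+ *     const char msg[] = "xxhash";
+ *     XXH64_hash_t const h1 = XXH64(msg, sizeof(msg)-1, 0);    // default seed
+ *     XXH64_hash_t const h2 = XXH64(msg, sizeof(msg)-1, 42);   // same input,
+ *                                        // different seed, unrelated hash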
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ * + * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run + * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are + * explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The 128-bit version adds additional strength, but it is slightly slower. + * + * The XXH3 algorithm is still in development. + * The results it produces may still change in future versions. + * + * Results produced by v0.7.x are not comparable with results from v0.7.y. + * However, the API is completely stable, and it can safely be used for + * ephemeral data (local sessions). + * + * Avoid storing values in long-term storage until the algorithm is finalized. + * + * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if + * everything remains fine, its current format will be "frozen" and become the + * final one. + * + * After which, return values of XXH3 and XXH128 will no longer change in + * future versions. + * + * XXH3's return values will be officially finalized upon reaching v0.8.0. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) + +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) + +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +#endif + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional + * collision. + * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid trivial sequences, such as repeating sequences and especially '\0', + * as this can cancel out itself. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXH3_SECRET_SIZE_MIN 136 +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly based on the default + * secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. 
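+ * For example (a sketch; `data` and `len` are whatever buffer the caller
+ * wants hashed):
+ *     XXH64_hash_t const h = XXH3_64bits_withSeed(data, len, 0xB0B5C0DEULL);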
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+   /* used to store a custom secret generated from the seed. Makes state larger.
+    * Design might change */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+   XXH32_hash_t bufferedSize;
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t secretLimit;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+   /* note: there is some padding after due to alignment on 64 bytes */
+   const unsigned char* secret;
+};   /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever possible.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with the default parameters.
+ * The result will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, and must outlive the hash streaming session, so
+ * be careful when using stack arrays.
+ * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
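+ *
+ * A sketch of the lifetime pitfall this implies (`fill_with_random_bytes` is
+ * a hypothetical helper):
+ *     XXH_errorcode bad(XXH3_state_t* s)
+ *     {
+ *         unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ *         fill_with_random_bytes(secret, sizeof(secret));
+ *         return XXH3_64bits_reset_withSecret(s, secret, sizeof(secret));
+ *     }   // BUG: `secret` dies here while `s` still references it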
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
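+ *
+ * For example (a sketch; `hashes` is assumed filled by the caller, and
+ * qsort() requires <stdlib.h>):
+ *     XXH128_hash_t hashes[256];
+ *     qsort(hashes, 256, sizeof(hashes[0]), XXH128_cmp);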
+ *
+ * return: >0 if *h128_1  > *h128_2
+ *         <0 if *h128_1  < *h128_2
+ *         =0 if *h128_1 == *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be found in xxhash.c.
+ *
+ * However, code inlining requires the implementation to be visible to the
+ * compiler, usually within the header.
+ *
+ * As a workaround, xxhash.c used to be included within xxhash.h. This caused
+ * some issues with some build systems, especially ones which treat .c files
+ * as source files.
+ *
+ * Therefore, the implementation is now directly integrated within xxhash.h.
+ * Another small advantage is that xxhash.c is no longer needed in /include.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!
+ * XXH_FORCE_MEMORY_ACCESS:
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selecting a different access method for improved
+ * performance.
+ * Method 0 (default):
+ *     Use `memcpy()`. Safe and portable.
+ * Method 1:
+ *     `__attribute__((packed))` statement. It depends on compiler extensions
+ *     and is therefore not portable.
+ *     This method is safe if your compiler supports it, and *generally* as
+ *     fast or faster than `memcpy`.
+ * Method 2:
+ *     Direct access via cast. This method doesn't depend on the compiler but
+ *     violates the C standard.
+ *     It can generate buggy code on targets which do not support unaligned
+ *     memory accesses.
+ *     But in some circumstances, it's the only known way to get the most
+ *     performance (i.e. GCC + ARMv6)
+ * Method 3:
+ *     Byteshift. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction.
+ *     See https://stackoverflow.com/a/32095106/646947 for details.
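+ * (For instance, building with `-DXXH_FORCE_MEMORY_ACCESS=3` selects the
+ * byteshift method from the command line; see the #ifndef guard below.)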
+ * Prefer these methods in priority order (0 > 1 > 2 > 3)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+ * triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is, the result for null input pointers is the same as a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!
+ * XXH_FORCE_ALIGN_CHECK:
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means: check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned or when alignment
+ * doesn't matter for performance.
+ *
+ * This option does not affect XXH3.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK   /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!
+ * XXH_NO_INLINE_HINTS:
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#ifndef XXH_NO_INLINE_HINTS
+#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+/*!
+ * XXH_REROLL:
+ * Whether to reroll XXH32_finalize, and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang.
+ */
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*!
+ * Modify the local functions below should you wish to use some other memory
+ * routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free(void* p)   { free(p); }
+
+/*!
and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS   /* disable inlining hints */
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#elif defined(_MSC_VER)   /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) \
+    || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
+#    endif
+#  else
+#    define XXH_FORCE_INLINE static
+#    define XXH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*
+ * DEBUGLEVEL is expected to be defined externally, typically via the compiler's
+ * command line options. The value must be a number.
+ */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint8_t xxh_u8;
+#else
+   typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* ***   Memory access   *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPU which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/*!
+ * XXH_CPU_LITTLE_ENDIAN:
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+     * loop (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp, v  // not required with VEX encoding
+     *      pslld tmp, 13  // tmp <<= 13
+     *      psrld v, 19    // x >>= 19
+     *      por v, tmp     // x |= tmp
+     *   compared to one for scalar:
+     *      roll v, 13     // reliably fast across the board
+     *      shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     *    :             // However, as an Input/Output Operand,
+     *    "+r"          // constrain a read/write operand (+) as a general purpose register (r).
+     *    (acc)         // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize.
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
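For reference, a minimal one-shot use of XXH32() as defined above (a sketch; assumes xxhash.h is on the include path):

    #include <stdio.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        const char* msg = "hello world";
        XXH32_hash_t h = XXH32(msg, strlen(msg), 0); /* seed 0 */
        printf("XXH32 = %08x\n", (unsigned)h);
        return 0;
    }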
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
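A typical use of the streaming API above, feeding data in arbitrary chunks (a sketch; part1/part2 are assumed buffers, and error checking of XXH32_createState() is omitted):

    XXH32_state_t* st = XXH32_createState();
    XXH32_reset(st, 0);                  // seed 0
    XXH32_update(st, part1, part1_len);  // any chunking works
    XXH32_update(st, part2, part2_len);
    XXH32_hash_t h = XXH32_digest(st);   // equals one-shot XXH32 over both
    XXH32_freeState(st);

The result is identical to hashing the concatenated buffers in a single XXH32() call.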
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
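As an aside, XXH64_avalanche() above also works as a standalone finisher for mixing a single 64-bit value, a common hash-table trick. A sketch (not an official xxHash API):

    static xxh_u64 mix64(xxh_u64 h)
    {
        h ^= h >> 33;  h *= PRIME64_2;  // fold high bits down, then multiply
        h ^= h >> 29;  h *= PRIME64_3;
        h ^= h >> 32;                   // final fold: every input bit can
        return h;                       // affect every output bit
    }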
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
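A short round-trip through the canonical helpers defined next (a sketch; buf, len and the open FILE* f are assumed, error handling omitted):

    XXH64_hash_t h = XXH64(buf, len, 0);
    XXH64_canonical_t canon;
    XXH64_canonicalFromHash(&canon, h);   /* big-endian byte order */
    fwrite(&canon, sizeof(canon), 1, f);  /* safe to store or transmit */
    /* ... later, after fread() back into canon: */
    XXH64_hash_t back = XXH64_hashFromCanonical(&canon);
    /* back == h on any machine, regardless of native endianness */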
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From d13d625f7363449c3fdc041b0a22005b92c83229 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 04:33:36 +0200 Subject: jit: make everything configurable --- src/ARM.cpp | 127 ++++++++++++++++++++++++++++----- src/ARM.h | 3 + src/ARMJIT.cpp | 21 ++++-- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 14 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/Config.cpp | 6 ++ src/Config.h | 3 + src/NDS.cpp | 26 ++++++- src/frontend/qt_sdl/PlatformConfig.cpp | 1 + 10 files changed, 171 insertions(+), 34 deletions(-) (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index baf8468..1cd4bb2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -532,7 +532,7 @@ void ARMv5::Execute() while (NDS::ARM9Timestamp < NDS::ARM9Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -565,14 +565,8 @@ void ARMv5::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); - + } + // TODO optimize this shit!!! if (Halted) { @@ -597,6 +591,58 @@ void ARMv5::Execute() Halted = 0; } +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(0, instrAddr)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + if (Halted) + { + if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + if (IRQ) TriggerIRQ(); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} + void ARMv4::Execute() { if (Halted) @@ -620,7 +666,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -648,13 +694,7 @@ void ARMv4::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + } // TODO optimize this shit!!! 
if (Halted) @@ -679,3 +719,56 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; } + +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(1, instrAddr)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + // TODO optimize this shit!!! + if (Halted) + { + if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + + if (IRQ) TriggerIRQ(); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index e0832e2..3b01ef3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,6 +52,7 @@ public: } virtual void Execute() = 0; + virtual void ExecuteJIT() = 0; bool CheckCondition(u32 code) { @@ -159,6 +160,7 @@ public: void DataAbort(); void Execute(); + void ExecuteJIT(); // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -281,6 +283,7 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); + void ExecuteJIT(); u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 47b425f..e8e6be0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -125,18 +127,21 @@ CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; - FetchedInstr instrs[12]; + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 r15Initial = cpu->R[15]; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; - //printf("block %x %d\n", r15, thumb); do { r15 += thumb ? 2 : 4; instrs[i].Instr = nextInstr[0]; - //printf("%x %x\n", instrs[i].Instr, r15); instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; if (cpu->Num == 0) @@ -166,16 +171,16 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < 10); + } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, r15Initial - (thumb ? 
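Both ExecuteJIT() loops reduce to the same lookup-or-compile dispatch. A condensed sketch with illustrative names (not melonDS's actual signatures):

    typedef int (*JitEntry)();             // compiled block, returns cycles

    int RunJitBlock(u32 pc)
    {
        JitEntry block = LookUpBlock(pc);  // assumed: NULL on cache miss
        if (!block)
            block = CompileBlock(pc);      // assumed: compiles, caches, returns
        return block();                    // execute and account the cycles
    }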
2 : 4), block); + InsertBlock(cpu->Num, blockAddr, block); return block; } -void ResetBlocks() +void InvalidateBlockCache() { memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); @@ -185,6 +190,8 @@ void ResetBlocks() memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + + compiler->Reset(); } } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..004256c 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -111,7 +111,7 @@ void DeInit(); CompiledBlock CompileBlock(ARM* cpu); -void ResetBlocks(); +void InvalidateBlockCache(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 2b7ccd2..fe23859 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -336,13 +336,15 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +void Compiler::Reset() +{ + SetCodePtr((u8*)ResetStart); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) - { - ResetBlocks(); - SetCodePtr((u8*)ResetStart); - } + InvalidateBlockCache(); CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -355,7 +357,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -469,7 +471,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); return res; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e04f96a..cd58012 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,6 +22,8 @@ class Compiler : public Gen::X64CodeBlock public: Compiler(); + void Reset(); + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); diff --git a/src/Config.cpp b/src/Config.cpp index 5745f34..5c0892a 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,9 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +bool JIT_Enable = false; +int JIT_MaxBlockSize = 12; + ConfigEntry ConfigFile[] = { {"BIOS9Path", 1, BIOS9Path, 0, "", 1023}, @@ -48,6 +51,9 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 3947598..9dda157 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,6 +51,9 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +extern bool JIT_Enable; +extern int JIT_MaxBlockSize; + } #endif // CONFIG_H diff --git a/src/NDS.cpp b/src/NDS.cpp index 4073536..cb85d13 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -566,7 +566,7 @@ void Reset() KeyCnt = 0; RCnt = 0; - ARMJIT::ResetBlocks(); + 
ARMJIT::InvalidateBlockCache(); NDSCart::Reset(); GBACart::Reset(); @@ -794,6 +794,11 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } + if (!file->Saving) + { + ARMJIT::InvalidateBlockCache(); + } + return true; } @@ -884,6 +889,7 @@ void RunSystem(u64 timestamp) } } +template u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -917,7 +923,10 @@ u32 RunFrame() } else { - ARM9->Execute(); + if (EnableJIT) + ARM9->ExecuteJIT(); + else + ARM9->Execute(); } RunTimers(0); @@ -940,7 +949,10 @@ u32 RunFrame() } else { - ARM7->Execute(); + if (EnableJIT) + ARM7->ExecuteJIT(); + else + ARM7->Execute(); } RunTimers(1); @@ -970,6 +982,14 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ + if (Config::JIT_Enable) + return RunFrame(); + else + return RunFrame(); +} + void Reschedule(u64 target) { if (CurCPU == 0) diff --git a/src/frontend/qt_sdl/PlatformConfig.cpp b/src/frontend/qt_sdl/PlatformConfig.cpp index 06128d7..bfb3f97 100644 --- a/src/frontend/qt_sdl/PlatformConfig.cpp +++ b/src/frontend/qt_sdl/PlatformConfig.cpp @@ -72,6 +72,7 @@ char MicWavPath[1024]; char LastROMFolder[1024]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { -- cgit v1.2.3 From 86f2be7260f9a9b51efd7c795c28cdcfda775742 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 19:24:00 +0200 Subject: jit: add compile option --- CMakeLists.txt | 36 ++++++++++++++++++++++ src/ARM.cpp | 13 ++++---- src/ARM.h | 6 ++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 - src/CMakeLists.txt | 25 +++++++++------- src/CP15.cpp | 12 ++++++-- src/Config.cpp | 4 +++ src/Config.h | 2 ++ src/NDS.cpp | 26 ++++++++++++++++ src/dolphin/CodeBlock.h | 3 -- 11 files changed, 136 insertions(+), 53 deletions(-) (limited to 'src/Config.h') diff --git a/CMakeLists.txt b/CMakeLists.txt index 885f0dd..1e53c60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,42 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64) + option(ENABLE_JIT "Enable x64 JIT recompiler" ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() + +if (CMAKE_BUILD_TYPE STREQUAL Release) + option(ENABLE_LTO "Enable link-time optimization" ON) +else() + option(ENABLE_LTO "Enable link-time optimization" OFF) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Og) endif() diff --git a/src/ARM.cpp b/src/ARM.cpp index 1cd4bb2..bfe1890 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -81,15 +81,8 @@ ARMv4::ARMv4() : ARM(1) // } -namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} - void ARM::Reset() { - FILE* blabla = fopen("fhhg", "w"); - for (int i = 0; i < ARMInstrInfo::ak_Count; i++) - fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); - 
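The RunFrame() pair above templates the frame loop on a bool, so the JIT check is resolved at compile time inside the hot loop and only one branch remains at the call site; note that the template argument lists (<bool EnableJIT>, <true>, <false>) are dropped by this listing. Spelled out as a sketch:

    template <bool EnableJIT>
    u32 RunFrame();            // frame loop; "if (EnableJIT)" folds away

    u32 RunFrame()
    {
        if (Config::JIT_Enable)
            return RunFrame<true>();
        else
            return RunFrame<false>();
    }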
fclose(blabla); - Cycles = 0; Halted = 0; @@ -591,6 +584,7 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv5::ExecuteJIT() { if (Halted) @@ -642,6 +636,7 @@ void ARMv5::ExecuteJIT() if (Halted == 2) Halted = 0; } +#endif void ARMv4::Execute() { @@ -720,6 +715,7 @@ void ARMv4::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv4::ExecuteJIT() { if (Halted) @@ -771,4 +767,5 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; -} \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index 3b01ef3..c3e7f44 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,7 +52,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -160,7 +162,9 @@ public: void DataAbort(); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -283,7 +287,9 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fe23859..18cb27e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,7 +4,10 @@ #include +#include "../dolphin/CommonFuncs.h" + #ifdef _WIN32 +#include #else #include #include @@ -32,8 +35,6 @@ const int RegisterCache::NativeRegsAvailable = #endif ; -int instructionPopularityARM[ARMInstrInfo::ak_Count]; - /* We'll repurpose this .bss memory @@ -42,29 +43,33 @@ u8 CodeMemory[1024 * 1024 * 32]; Compiler::Compiler() { -#ifdef _WIN32 -#else - u64 pagesize = sysconf(_SC_PAGE_SIZE); -#endif - - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; - -#ifdef _WIN32 -#else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); -#endif - - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + } ClearCodeSpace(); - SetCodePtr(pageAligned); - - memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); - for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -118,7 +123,7 @@ Compiler::Compiler() SetJumpTarget(und); MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); RET(); - } + } { // RSCRATCH mode // ABI_PARAM2 reg n @@ -163,7 +168,10 @@ Compiler::Compiler() RET(); } - ResetStart = (void*)GetWritableCodePtr(); + // move the region forward to prevent overwriting the generated functions + region_size -= GetWritableCodePtr() - region; + total_region_size = region_size; + region = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void 
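The constructor above turns a static buffer into JIT code memory: align the start and size to page boundaries, then mark the pages read/write/execute. The platform split reduces to a small helper, sketched here under the same assumption of page-aligned arguments:

    #ifdef _WIN32
    #include <windows.h>
    #else
    #include <sys/mman.h>
    #endif

    bool MakeExecutable(void* start, size_t size)
    {
    #ifdef _WIN32
        DWORD old;
        return VirtualProtect(start, size, PAGE_EXECUTE_READWRITE, &old) != 0;
    #else
        return mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == 0;
    #endif
    }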
Compiler::Reset() { - SetCodePtr((u8*)ResetStart); + ClearCodeSpace(); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) @@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 75fa42c..bfc0ad9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,19 +49,22 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index 3e1c08b..5b5f935 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -813,7 +813,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -835,7 +837,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -857,8 +861,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -880,8 +886,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 5c0892a..33bab75 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,8 +37,10 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -51,8 +53,10 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, +#ifdef 
JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 9dda157..9296335 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,8 +51,10 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +#ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +#endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index cb85d13..7636a07 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -169,7 +169,9 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED ARMJIT::Init(); +#endif DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); @@ -203,7 +205,9 @@ void DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED ARMJIT::DeInit(); +#endif for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -566,7 +570,9 @@ void Reset() KeyCnt = 0; RCnt = 0; +#ifdef JIT_ENABLED ARMJIT::InvalidateBlockCache(); +#endif NDSCart::Reset(); GBACart::Reset(); @@ -794,10 +800,12 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED if (!file->Saving) { ARMJIT::InvalidateBlockCache(); } +#endif return true; } @@ -923,9 +931,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM9->ExecuteJIT(); else +#endif ARM9->Execute(); } @@ -949,9 +959,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM7->ExecuteJIT(); else +#endif ARM7->Execute(); } @@ -984,9 +996,11 @@ u32 RunFrame() u32 RunFrame() { +#ifdef JIT_ENABLED if (Config::JIT_Enable) return RunFrame(); else +#endif return RunFrame(); } @@ -1998,7 +2012,9 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2050,7 +2066,9 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2118,7 +2136,9 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2414,7 +2434,9 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2475,7 +2497,9 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2546,7 +2570,9 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(1, addr); +#endif switch (addr & 0xFF800000) { diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h index 31a8d93..e71cf6d 100644 --- a/src/dolphin/CodeBlock.h +++ b/src/dolphin/CodeBlock.h @@ -9,7 +9,6 @@ #include "Assert.h" #include "../types.h" -#include "MemoryUtil.h" namespace Common { @@ -41,8 +40,6 @@ public: CodeBlock() = default; virtual ~CodeBlock() { - if (region) - FreeCodeSpace(); } CodeBlock(const CodeBlock&) = delete; CodeBlock& operator=(const CodeBlock&) = delete; -- cgit v1.2.3 From 40b88ab05aeb7e5c5216f29f4004fb5797db04b5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... 
- more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- 18 files changed, 1529 insertions(+), 688 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 7caef75..1e75301 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -623,21 +623,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -753,23 +758,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! 
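The Halted handling above encodes this commit's idle-loop recognition (mirrored for the ARM7 just below): a compiled block flags an idle loop by setting bit 0x20 in Halted, and since such a loop writes no memory and carries no state between iterations, the emulator may fast-forward to the next scheduled event. The core of it, as a sketch:

    bool idleLoop = Halted & 0x20;  // set by the compiled block
    Halted &= ~0x20;
    if ((Halted == 1 || idleLoop) && Timestamp < Target)
        Timestamp = Target;         // nothing observable can happen before then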
+ if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -779,6 +789,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -801,6 +813,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 811b2e0..b36120a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -311,7 +311,7 @@ public: { *val = BusRead8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -320,7 +320,7 @@ public: *val = BusRead16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -329,7 +329,7 @@ public: *val = BusRead32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -337,14 +337,14 @@ public: addr &= ~3; *val = BusRead32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -353,7 +353,7 @@ public: BusWrite16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -362,7 +362,7 @@ public: BusWrite32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -370,7 +370,7 @@ public: addr &= ~3; BusWrite32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include +#include #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) 
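JIT_DEBUGPRINT is defined to nothing here, so the trace calls sprinkled through this file compile away entirely. The usual shape of such a switchable macro, as a sketch (presumably the define is swapped when debugging; the ## is a GNU-style extension for empty argument lists):

    #include <cstdio>

    #ifdef JIT_DEBUG
    #define JIT_DEBUGPRINT(msg, ...) printf(msg, ##__VA_ARGS__)
    #else
    #define JIT_DEBUGPRINT(msg, ...) do { } while (0)
    #endif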
+ Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, 
+ DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, 
+ +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG pseudo opcode +}; +#undef F
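(A worked illustration of IsIdleLoop's two bitmasks, mirroring the Dolphin check cited above — illustrative only, not from the patch:

    // loop: ldr r0, [r1]    ; r1 is read before ever being written
    //       cmp r0, #0      ;   -> r1 becomes "disallowed to write"
    //       beq loop        ; a branch is allowed only as the last instruction
    // no register feeding an iteration is modified by it -> idle loop;
    // the branch gets branch_IdleBranch and the core can skip ahead.
    //
    // loop: subs r0, r0, #1 ; r0 is read (now disallowed) and then written
    //       bne  loop       ;   -> dstRegs & regsDisallowedToWrite != 0
    // the counter carries state between iterations -> not an idle loop.

Stores are rejected outright via special_WriteMem, since a loop that writes memory always has an observable effect.)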
+ +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[j].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? 
"thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file
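(GetFuncForAddr keys its dispatch on size | store: the access sizes 8/16/32 are even, so ORing in the store flag yields six distinct cases, 8/9, 16/17, 32/33. The same pattern in isolation — PickHandler and the Read/Write names are invented for illustration:

    // sketch of the (size | store) dispatch used by GetFuncForAddr above
    void* PickHandler(bool store, int size) // size is 8, 16 or 32
    {
        switch (size | (store ? 1 : 0))
        {
        case 8:  return (void*)Read8;   case 9:  return (void*)Write8;
        case 16: return (void*)Read16;  case 17: return (void*)Write16;
        case 32: return (void*)Read32;  case 33: return (void*)Write32;
        }
        return NULL;
    }

This is also why the Wifi case checks size == 16 explicitly: Wifi::Read/Write only exist as halfword handlers.)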
diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. - Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, - we only need every second half word to be adressable. +extern u32 AddrTranslate9[0x2000]; +extern u32 AddrTranslate7[0x4000]; - In case a memory write hits mapped memory, the function block at this - address is set to null, so it's recompiled the next time it's executed. - - This method has disadvantages, namely that only writing to the - first instruction of a block marks it as invalid and that memory remapping - (SWRAM and VRAM) isn't taken into account. -*/ - -struct BlockCache -{ - CompiledBlock* AddrMapping9[0x2000] = {0}; - CompiledBlock* AddrMapping7[0x4000] = {0}; - - CompiledBlock MainRAM[4*1024*1024/2]; - CompiledBlock SWRAM[0x8000/2]; // Shared working RAM - CompiledBlock ARM9_ITCM[0x8000/2]; - CompiledBlock ARM9_LCDC[0xA4000/2]; - CompiledBlock ARM9_BIOS[0x8000/2]; - CompiledBlock ARM7_BIOS[0x4000/2]; - CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM -}; - -extern BlockCache cache; +const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... +extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template <u32 num> inline bool IsMapped(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; } template <u32 num> -inline CompiledBlock LookUpBlock(u32 addr) +inline u32 TranslateAddr(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } template <u32 num> -inline void Invalidate16(u32 addr) +inline JitBlockEntry LookUpBlock(u32 addr) { - if (IsMapped(addr)) - { - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; - } -} - -template <u32 num> -inline void Invalidate32(u32 addr) -{ - if (IsMapped(addr)) - { - if (num == 0) - { - CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; - page[(addr & 0x7FFF) >> 1] = NULL; - page[((addr + 2) & 0x7FFF) >> 1] = NULL; - } - else - { - CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; - } - } -} - -template <u32 num> -inline void InsertBlock(u32 addr, CompiledBlock func) -{ - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + return FastBlockAccess[TranslateAddr<num>(addr) / 2]; } void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateAll(); + +void InvalidateITCM(u32 addr); +void InvalidateByAddr7(u32 addr); + +void CompileBlock(ARM* cpu); -void InvalidateBlockCache(); +void ResetBlockCache(); }
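(With the new ARMJIT.h, dispatch is a flat table lookup on the pseudo physical PC instead of the old two-level page walk. A hedged sketch of the caller side — the real loop lives in the CPUs' ExecuteJIT, this is only illustrative:

    // run or compile the block at the current ARM9 PC (num = 0)
    u32 instrAddr = cpu->R[15] - ((cpu->CPSR & 0x20) ? 2 : 4);
    ARMJIT::JitBlockEntry entry = ARMJIT::LookUpBlock<0>(instrAddr);
    if (!entry)
    {
        ARMJIT::CompileBlock(cpu);               // fills FastBlockAccess
        entry = ARMJIT::LookUpBlock<0>(instrAddr);
    }
    cpu->Cycles += entry();                      // returns the constant part of the cycle count

Note that CompileBlock no longer returns the entry point; callers re-read it through FastBlockAccess after compiling.)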
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
Blocks; + u16 TimesInvalidated; +}; + +extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +typedef void (*InterpreterFunc)(ARM* cpu); +extern InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index fe2f203..ed6a2b7 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -60,15 +60,46 @@ public: assert("Welp!"); } + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + void Flush() { BitSet16 loadedSet(LoadedRegs); for (int reg : loadedSet) UnloadRegister(reg); + LiteralsLoaded = 0; } void Prepare(bool thumb, int i) { + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + u16 futureNeeded = 0; int ranking[16]; for (int j = 0; j < 16; j++) @@ -86,7 +117,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; + FetchedInstr Instr = Instrs[i]; u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -125,6 +156,9 @@ public: static const int NativeRegsAvailable; Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; u32 NativeRegsUsed = 0; u16 LoadedRegs = 0; u16 DirtyRegs = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f868ddf..14c223b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp() MOV(32, rd, op2); if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); if (S) { @@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); - if (op & 1) + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); @@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU() u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) - Comp_AddCycles_CI(1); + Comp_AddCycles_CI(1); // shift by reg else Comp_AddCycles_C(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cc7a3c4..0dedb3f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -16,9 +16,6 @@ int squeezePointer(T* ptr) void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { // we can simplify constant branches by a lot - // it's not completely safe to assume stuff like, which instructions to preload - // we'll see how it works out - IrregularCycles = true; u32 newPC; @@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, 
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
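All the Thumb loads and stores above now funnel into Comp_MemAccess through ComplexOperand, which packs the two ARM offset forms - a plain immediate, or a register with shift op and amount - into a single argument. Its definition is not part of this hunk; a stand-in with the same shape (field names follow the usage seen above, the rest is assumption):

#include <cstdint>

struct ComplexOperand
{
    bool IsImm;
    union
    {
        uint32_t Imm;
        struct { int Reg, Op, Amount; } Reg;
    };

    ComplexOperand() : IsImm(false), Imm(0) {}
    ComplexOperand(uint32_t imm) : IsImm(true), Imm(imm) {}
    ComplexOperand(int reg, int op, int amount) : IsImm(false)
    {
        Reg.Reg = reg; Reg.Op = op; Reg.Amount = amount;
    }
};

int main()
{
    ComplexOperand imm(0xFFFu), reg(6, 0, 0);
    return (imm.IsImm && !reg.IsImm) ? 0 : 1;
}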
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index 5b5f935..8a9b31d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -562,9 +562,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
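A knock-on effect of the new T_WriteMem flag above: the Thumb flag space grew past bit 20, so tk() moves from << 21 to << 22 and Decode() now extracts the kind with >> 22 to match - with the old shift, bit 21 would have leaked into the kind field. The packing, checked at compile time (constants illustrative):

#include <cstdint>

constexpr uint32_t T_WriteMem = 1u << 21;
constexpr uint32_t tk(uint32_t kind) { return kind << 22; }
constexpr uint32_t KindOf(uint32_t data) { return (data >> 22) & 0x3F; }

static_assert(KindOf(tk(0x2A) | T_WriteMem) == 0x2A,
              "flags below bit 22 must not corrupt the kind field");

int main() { return 0; }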
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -814,7 +816,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -838,7 +840,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -862,8 +864,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -887,8 +888,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 33bab75..c117a41 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,6 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -56,6 +57,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 9296335..c9013aa 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,6 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0bde139..0cfbd1a 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -575,7 +575,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -807,7 +807,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -2016,10 +2016,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2070,10 +2066,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2140,10 +2132,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2439,7 +2427,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2502,7 +2490,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2575,7 +2563,7 @@ void 
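The CP15 and NDS write paths above are where invalidation gets its precision: a store into memory that may hold translated code looks up the 512-byte AddressRange covering the written address and throws out (or retires) the blocks recorded there. In outline, reusing names this patch series defines elsewhere - the wrapper itself is illustrative, not code from the diff:

void OnCodeMemoryWrite(u32 pseudoPhys)
{
    AddressRange* range = &CodeRanges[pseudoPhys / 512];
    if (range->Blocks.Length > 0)           // pay the cost only where code actually lives
        InvalidateByAddr(pseudoPhys, true); // true: keep blocks as restore candidates
}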
ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) -- cgit v1.2.3 From 441869a10567c2da3de210052cbe93d783a9ce83 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 13:29:17 +0200 Subject: integrate changes from ARM64 backend and more - better handle LDM/STM in reg alloc - unify Halted and IRQ in anticipation for branch inlining - literal optimisations can be disabled in gui - jit blocks follow simple returns - fix idle loop detection - break jit blocks on IRQ (fixes saving in Pokemon White) --- src/ARM.cpp | 40 ++++++++++++++++++----------- src/ARM.h | 13 +++++++--- src/ARMJIT.cpp | 50 +++++++++++++++++++++++++++++++------ src/ARMJIT_RegisterCache.h | 33 +++++++++++++++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 7 +++--- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++---- src/ARM_InstrInfo.cpp | 28 +++++++++++++++++++++ src/ARM_InstrInfo.h | 2 +- src/Config.cpp | 2 ++ src/Config.h | 1 + src/NDS.cpp | 4 +-- 11 files changed, 153 insertions(+), 43 deletions(-) (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 1e75301..2f4aa90 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -159,7 +159,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -632,16 +632,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -769,16 +774,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index b36120a..96dd857 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e70..0695b85 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) 
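The union introduced in ARM.h above is the concrete form of "unify Halted and IRQ" from the commit message: Halted, IRQ and IdleLoop alias the low bytes of StopExecution, so the execution loop can test every reason to stop with a single 32-bit compare, and the savestate code keeps its single Var32. A sketch of the aliasing - like the patch itself, it relies on the widely supported anonymous-struct and union-punning extensions:

#include <cstdint>
#include <cassert>

union StopFlags
{
    struct { uint8_t Halted, IRQ, IdleLoop; };
    uint32_t StopExecution;
};

int main()
{
    StopFlags f;
    f.StopExecution = 0;           // clear every exit reason at once
    f.IRQ = 1;                     // set any single one...
    assert(f.StopExecution != 0);  // ...and the combined test trips
}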
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
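DecodeBranch above implements the "jit blocks follow simple returns" item: CompileBlock remembers the value the last BL (or BL_LONG) wrote to LR, forgets it as soon as any instruction writes R14, and can then treat a later BX LR as a branch to a compile-time constant. The bookkeeping reduced to a hypothetical stand-alone helper:

#include <cstdint>

struct LinkTracker
{
    bool HasLink = false;
    uint32_t LR = 0;

    void OnInstr(bool writesR14, bool isBranchWithLink, uint32_t instrAddr)
    {
        if (writesR14) HasLink = false;   // any write to R14 invalidates the tracking
        if (isBranchWithLink)
        {
            HasLink = true;
            LR = instrAddr + 4;           // the return address BL establishes
        }
    }

    bool TryResolveReturn(uint32_t& target) const  // for a BX LR
    {
        if (!HasLink) return false;
        target = LR;
        return true;
    }
};

int main() { return 0; }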
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = 
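The register-allocator change above adds a second tier of registers: everything an LDM/STM touches is flagged NotStrictlyNeeded - worth caching, never worth an eviction - so those registers are mapped only while spare host registers remain. The selection loop as a runnable toy:

#include <cstdint>
#include <vector>

std::vector<int> PickOptionalLoads(uint16_t optionalRegs, int loadedCount, int nativeAvailable)
{
    std::vector<int> picked;
    for (int reg = 0; reg < 16; reg++)
    {
        if (loadedCount + (int)picked.size() >= nativeAvailable)
            break;                                // never evict for an optional register
        if (optionalRegs & (1 << reg))
            picked.push_back(reg);
    }
    return picked;
}

int main()
{
    auto regs = PickOptionalLoads(0x00F0, 4, 6);  // r4-r7 wanted, two host slots free
    return regs.size() == 2 ? 0 : 1;              // only r4 and r5 get mapped
}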
CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
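The rnMapped = Imm32(R15 & ~0x2) special case above encodes a Thumb detail: when PC is the base register of a load, it reads as the instruction address plus 4 forced down to word alignment, so the base becomes a compile-time constant. A quick numeric check:

#include <cstdint>
#include <cassert>

uint32_t ThumbPCBase(uint32_t instrAddr) { return (instrAddr + 4) & ~2u; }

int main()
{
    assert(ThumbPCBase(0x02000000) == 0x02000004);
    assert(ThumbPCBase(0x02000002) == 0x02000004);  // halfword-aligned PC rounds down
}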
-1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe..8f8bd35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4f..2732181 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index c117a41..a7d78cd 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -41,6 +41,7 @@ char DSiNANDPath[1024]; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,6 +59,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c9013aa..1fcd9bb 100644 --- a/src/Config.h +++ b/src/Config.h @@ -55,6 +55,7 @@ extern char DSiNANDPath[1024]; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0cfbd1a..7b6a450 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1211,9 +1211,9 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); if ((ConsoleType == 1) && cpu) - arm->IRQ |= (IE2 & IF2); + arm->IRQ |= !!(IE2 & IF2); } else { -- cgit v1.2.3 From 1c07932b40e6e072c6ea66c49889860252e45186 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + 
src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 6 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4870 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src/Config.h') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 896bb5c..3eac74d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -252,15 +252,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -273,9 +273,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -315,7 +315,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -328,7 +328,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -587,7 +587,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -627,14 +627,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -728,7 +730,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -768,14 +770,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = 
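Every Cycles += in the hunks above turns into -= because this commit makes the counter run downwards: the dispatcher seeds Cycles with Target - Timestamp - 1, each block subtracts what it consumed, and execution leaves once the value goes negative, so generated code can test the sign flag after a SUB instead of comparing against a target. The scheme end to end:

#include <cstdint>
#include <cstdio>

int main()
{
    int64_t timestamp = 0, target = 10;
    int32_t cycles = (int32_t)(target - timestamp - 1);  // the "- 1" turns "<= 0" into "< 0"
    while (cycles >= 0)                                  // JIT code: a plain sign test
        cycles -= 3;                                     // each block: SUB Cycles, n
    timestamp = target - (cycles + 1);                   // convert back, keeping the overshoot
    printf("stopped at %lld, target was %lld\n", (long long)timestamp, (long long)target);
    return 0;
}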
NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp += Cycles; - Cycles = 0; + NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index ccef265..b71102a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -193,14 +193,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -211,9 +211,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -223,9 +223,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -387,13 +387,13 @@ public: void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; } void AddCycles_CDI() @@ -405,21 +405,21 @@ public: if ((DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else { numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } } else if (CodeRegion == 0x02) { numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD + 1; + Cycles -= numC + numD + 1; } } @@ -432,17 +432,17 @@ public: if ((DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else if (CodeRegion == 0x02) { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD; + Cycles -= numC + numD; } } }; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 208801e..cc8d4ce 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,10 @@ #include #include +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" #include "Config.h" @@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = { u32 AddrTranslate9[0x2000]; u32 AddrTranslate7[0x4000]; -JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; AddressRange CodeRanges[ExeMemSpaceSize / 512]; -TinyVector JitBlocks; -JitBlock* RestoreCandidates[0x1000] = {NULL}; +std::unordered_map JitBlocks; -u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) +template +struct UnreliableHashTable { - return 
(u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); -} + struct Bucket + { + K KeyA, KeyB; + V ValA, ValB; + }; + + Bucket Table[Size]; + + void Reset() + { + for (int i = 0; i < Size; i++) + { + Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable RestoreCandidates; +UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
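The two-way bucket table above is deliberately lossy: Insert may displace an older entry and hands it back for disposal, and LookUp may miss even for a key that was inserted. That is acceptable for both users - RestoreCandidates is only a recycling heuristic, and FastBlockLookUp falls back to the real map on a miss. Usage in the style of the surrounding code (types come from ARMJIT.cpp, so this fragment is not stand-alone):

JitBlock* evicted = RestoreCandidates.Insert(pseudoPhysicalAddr, block);
if (evicted)
    delete evicted;                  // whatever falls out of the cache is ours to free

JitBlock* prev = RestoreCandidates.LookUp(pseudoPhysicalAddr);
if (prev)                            // a miss here is allowed; it only costs a recompile
    RestoreCandidates.Remove(pseudoPhysicalAddr);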
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; } + for (int i = 0; i < block->NumLinks(); i++) + compiler->UnlinkBlock(block->Links()[i]); + block->ResetLinks(); - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } - JitBlocks.Clear(); + JitBlocks.clear(); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - - memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); - for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + + FastBlockLookUp.Reset(); + RestoreCandidates.Reset(); + for (int i = 0; i 
< sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { - if (RestoreCandidates[i]) + if (RestoreCandidates.Table[i].ValA) { - delete RestoreCandidates[i]; - RestoreCandidates[i] = NULL; + delete RestoreCandidates.Table[i].ValA; + RestoreCandidates.Table[i].ValA = NULL; + } + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValB; + RestoreCandidates.Table[i].ValB = NULL; } } - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -788,11 +882,43 @@ void ResetBlockCache() } delete block; } - JitBlocks.Clear(); + JitBlocks.clear(); compiler->Reset(); } +JitBlockEntry LookUpBlockEntry(u32 addr) +{ + u32 entryOffset = FastBlockLookUp.LookUp(addr); + if (entryOffset != UINT32_MAX) + return compiler->AddEntryOffset(entryOffset); + + auto block = JitBlocks.find(addr); + if (block != JitBlocks.end()) + { + FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + return block->second->EntryPoint; + } + return NULL; +} + +template +void LinkBlock(ARM* cpu, u32 codeOffset) +{ + u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); + auto block = JitBlocks.find(targetPseudoPhys); + if (block == JitBlocks.end()) + { + CompileBlock(cpu); + block = JitBlocks.find(targetPseudoPhys); + } + + JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); + + block->second->AddLink(codeOffset); + compiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) @@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) return NULL; } -} \ No newline at end of file +} + +template void ARMJIT::LinkBlock<0>(ARM*, u32); +template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 09cc463..cab385f 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -32,7 +32,6 @@ extern u32 AddrTranslate9[0x2000]; extern u32 AddrTranslate7[0x4000]; const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... 
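LookUpBlockEntry above caches entry offsets rather than raw pointers: an offset from ResetStart always fits the fast table's 32-bit value slot, leaving UINT32_MAX free as the miss sentinel - a guarantee a 64-bit host pointer could not give. The AddEntryOffset/SubEntryOffset round trip in isolation:

#include <cstdint>
#include <cassert>

int main()
{
    uint8_t codeMem[64];
    uint8_t* resetStart = codeMem;
    uint8_t* entry = codeMem + 40;

    uint32_t off = (uint32_t)(entry - resetStart);  // SubEntryOffset
    assert(resetStart + off == entry);              // AddEntryOffset
    assert(off != UINT32_MAX);                      // sentinel stays distinguishable
}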
-extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) @@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr) return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } -template -inline JitBlockEntry LookUpBlock(u32 addr) -{ - return FastBlockAccess[TranslateAddr(addr) / 2]; -} +JitBlockEntry LookUpBlockEntry(u32 addr); + void Init(); void DeInit(); @@ -73,4 +69,6 @@ void ResetBlockCache(); } +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + #endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 0d6add9..66d1808 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -15,7 +15,8 @@ enum { branch_IdleBranch = 1 << 0, branch_FollowCondTaken = 1 << 1, - branch_FollowCondNotTaken = 1 << 2 + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, }; struct FetchedInstr @@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector assert(capacity > Capacity); T* newMem = new T[capacity]; if (Data != NULL) - memcpy(newMem, Data, sizeof(Data) * Length); + memcpy(newMem, Data, sizeof(T) * Length); T* oldData = Data; Data = newMem; @@ -163,7 +164,6 @@ public: u32 NumInstrs; u32 NumAddresses; - u32 NumLinks; JitBlockEntry EntryPoint; @@ -171,6 +171,21 @@ public: { return &Data[0]; } u32* AddressRanges() { return &Data[NumInstrs]; } + u32* Links() + { return &Data[NumInstrs + NumAddresses]; } + + u32 NumLinks() + { return Data.Length - NumInstrs - NumAddresses; } + + void AddLink(u32 link) + { + Data.Add(link); + } + + void ResetLinks() + { + Data.SetLength(NumInstrs + NumAddresses); + } private: /* @@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000]; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); +template +void LinkBlock(ARM* cpu, u32 codeOffset); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index e02865d..cac590a 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) @@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) IrregularCycles = true; BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - bool previouslyDirty = CPSRDirty; + bool cpsrDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) @@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) LoadReg(reg, RegCache.Mapping[reg]); } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } void Compiler::A_Comp_BranchImm() @@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + 
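The one-character TinyVector fix above (sizeof(Data) to sizeof(T)) deserves a note: Data is a T*, so sizeof(Data) is the pointer size, and the old memcpy moved 8 * Length bytes on x64 - reading past the old allocation whenever T is smaller than a pointer. The mismatch in two lines:

#include <cstdio>

int main()
{
    unsigned short* data = nullptr;
    // on x64: sizeof(data) == 8 (the pointer), sizeof(*data) == 2 (the element)
    printf("sizeof(Data) = %zu, sizeof(T) = %zu\n", sizeof(data), sizeof(*data));
    return 0;
}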
Comp_SpecialBranchBehaviour(false); Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
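The NOP(5) above is the heart of block linking: five bytes is exactly the size of an x64 JMP rel32, so LinkBlock/UnlinkBlock further down can flip a block exit between "fall back to the dispatcher" and "jump straight into the next block" with a single in-place rewrite. The two encodings, assuming the target lies within +/-2 GiB of the patch site:

#include <cstdint>
#include <cstring>

void PatchJmp(uint8_t* at, const uint8_t* target)
{
    int32_t rel = (int32_t)(target - (at + 5));  // rel32 counts from the next instruction
    at[0] = 0xE9;                                // JMP rel32
    std::memcpy(at + 1, &rel, 4);
}

void UnpatchJmp(uint8_t* at)
{
    static const uint8_t nop5[5] = {0x0F, 0x1F, 0x44, 0x00, 0x00};  // canonical 5-byte NOP
    std::memcpy(at, nop5, 5);
}

int main() { return 0; }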
ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx 
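+// 64-bit views of the same System V argument registers follow; ARM_Dispatch uses them for the ARM* pointer and the block entry point.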
+#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c34ba3b..a0c3a36 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,9 +49,12 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -68,7 +71,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index 07b1e3e..e69319b 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -38,10 +38,10 @@ char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,7 +58,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 1fcd9bb..d546524 100644 --- a/src/Config.h +++ b/src/Config.h @@ -52,10 +52,10 @@ extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * Note: This file is separated for development purposes.
+ * It will be integrated into `xxhash.h` when development stage is completed.
+ *
+ * Credit: most of the work on vectorial and asm variants comes from @easyaspi314
+ */
+
+#ifndef XXH3_H_1397135465
+#define XXH3_H_1397135465
+
+/* === Dependencies === */
+#ifndef XXHASH_H_5627135585666179
+/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
+# undef XXH_INLINE_ALL /* avoid redefinition */
+# define XXH_INLINE_ALL
+#endif
+#include "xxhash.h"
+
+
+/* === Compiler specifics === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
+# define XXH_RESTRICT restrict
+#else
+/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+# define XXH_RESTRICT /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+# define XXH_likely(x) __builtin_expect(x, 1)
+# define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+# define XXH_likely(x) (x)
+# define XXH_unlikely(x) (x)
+#endif
+
+#if defined(__GNUC__)
+# if defined(__AVX2__)
+# include <immintrin.h>
+# elif defined(__SSE2__)
+# include <emmintrin.h>
+# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+# define inline __inline__ /* clang bug */
+# include <arm_neon.h>
+# undef inline
+# endif
+#elif defined(_MSC_VER)
+# include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *     xxh_u64 x;
+ *     x ^= (x >> 47); // good
+ *     x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *     x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *     // note: funnel shifts are not usually cheap.
+ *     x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *     x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32.
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
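+ * For example, the AVX2 path below uses XXH_ACC_ALIGN == 32, matching sizeof(__m256i).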
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ *
+ * aarch64 cannot access the high bits of a Q-form register, and writes to a
+ * D-form register zero the high bits, similar to how writes to W-form scalar
+ * registers (or DWORD registers on x86_64) work.
+ *
+ * The formerly free vget_high intrinsics now require a vext (with a few
+ * exceptions)
+ *
+ * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
+ * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
+ * operand.
+ *
+ * The equivalent of the VZIP.32 on the lower and upper halves would be this
+ * mess:
+ *
+ *     ext  v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+ *     zip1 v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+ *     zip2 v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+ *
+ * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
+ *
+ *     shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
+ *     xtn  v0.2s, v0.2d      // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+ *
+ * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ */
+
+/*
+ * Function-like macro:
+ * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+ * {
+ *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+ *     outHi = (uint32x2_t)(in >> 32);
+ *     in = UNDEFINED;
+ * }
+ */
+# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+   && defined(__GNUC__) \
+   && !defined(__aarch64__) && !defined(__arm64__)
+# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+    do { \
+      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
+      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+      __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
+      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
+      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
+    } while (0)
+# else
+# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+    do { \
+      (outLo) = vmovn_u64 (in); \
+      (outHi) = vshrn_n_u64 ((in), 32); \
+    } while (0)
+# endif
+#endif /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+# if defined(__s390x__)
+# include <s390intrin.h>
+# else
+# include <altivec.h>
+# endif
+
+# undef vector /* Undo the pollution */
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+# ifndef XXH_VSX_BE
+# if defined(__BIG_ENDIAN__) \
+   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+# define XXH_VSX_BE 1
+# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+# warning "-maltivec=be is not recommended. Please use native endianness."
+# define XXH_VSX_BE 1
+# else
+# define XXH_VSX_BE 0
+# endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+/* A wrapper for POWER9's vec_revb. */
+# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+# define XXH_vec_revb vec_revb
+# else
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+# endif
+# endif /* XXH_VSX_BE */
+
+/*
+ * Performs an unaligned load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meaning swap depending on version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+# define XXH_vec_mulo vec_mulo
+# define XXH_vec_mule vec_mule
+# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+# define XXH_vec_mulo __builtin_altivec_vmulouw
+# define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+#else
+# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+# else
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+# endif
+#endif /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+# error "default keyset is not large enough"
+#endif
+
+/* Pseudorandom secret taken directly from FARSH */
+XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ *     XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+ *     {
+ *         return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+ *     }
+ */
+#if defined(_MSC_VER) && defined(_M_IX86)
+# include <intrin.h>
+# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*
+ * Calculates a 64->128-bit long multiply.
+ *
+ * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this type
+     * despite not having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64 = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+# pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64 = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+#else
+    /*
+     * Portable scalar method.
Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/* + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * We don't need to (or want to) mix as much as XXH64. + * + * Short hashes are more evenly distributed, so it isn't necessary. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. 
+ * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + 
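+    /* Dispatch on length: 9-16 bytes, 4-8, 1-3, then the empty-input case; each helper mixes both ends of the input with a different region of the secret. */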
XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + __asm__ ("" : "+r" (seed64)); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
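+ * Each 16-byte chunk goes through XXH3_mix16B above: two 64-bit loads XORed with secret words, then a 128-bit multiply folded back to 64 bits.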
*/ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); + + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. 
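+ * (Per 64-bit lane in the scalar path: acc += input; acc += (u32)(input ^ secret) * ((input ^ secret) >> 32).)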
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
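+ * vld1q_u8 on a uint8_t pointer performs the unaligned load safely instead.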
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
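+ * Each stripe covers STRIPE_LEN (64) input bytes but advances the secret by only XXH_SECRET_CONSUME_RATE (8) bytes, so consecutive stripes see overlapping secret windows.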
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8);  /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);    /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));   /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const xxh_u8* secret, size_t secretSize)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->acc[0] = PRIME32_3;
+    statePtr->acc[1] = PRIME64_1;
+    statePtr->acc[2] = PRIME64_2;
+    statePtr->acc[3] = PRIME64_3;
+    statePtr->acc[4] = PRIME64_4;
+    statePtr->acc[5] = PRIME32_2;
+    statePtr->acc[6] = PRIME64_5;
+    statePtr->acc[7] = PRIME32_1;
+    statePtr->seed = seed;
+    XXH_ASSERT(secret != NULL);
+    statePtr->secret = secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
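+ * With the default sizes (XXH3_INTERNALBUFFER_SIZE == 256 and 64-byte
+ * stripes), XXH3_INTERNALBUFFER_STRIPES above evaluates to 4.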
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
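+ * (For instance, len == 16 makes the statement below add (xxh_u64)15 << 54
+ * to m128.low64.)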
+ */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* === XXH3 128-bit streaming === */
+
+/*
+ * All the functions are actually the same as for 64-bit streaming variant.
+ * The only difference is the finalization routine.
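+ *
+ * A minimal usage sketch (error checks omitted; `buf` and `n` stand for a
+ * caller-provided buffer and its length):
+ *   XXH3_state_t* const st = XXH3_createState();
+ *   XXH3_128bits_reset(st);
+ *   XXH3_128bits_update(st, buf, n);
+ *   { XXH128_hash_t const h = XXH3_128bits_digest(st); (void)h; }
+ *   XXH3_freeState(st);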
+ */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         state->secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         state->secret + state->secretLimit + STRIPE_LEN
+                                                       - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
+#define XXH_IMPLEMENTATION /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h
new file mode 100644
index 0000000..67a5887
--- /dev/null
+++ b/src/xxhash/xxhash.h
@@ -0,0 +1,1965 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64    13.8 GB/s            1.9 GB/s
+XXH32     6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ * INLINE mode
+ ******************************/
+/*!
+ * XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ *
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+ && !defined(XXH_INLINE_ALL_31684351384)
+ /* this section should be traversed only once */
+# define XXH_INLINE_ALL_31684351384
+ /* give access to the advanced API, required to compile implementations */
+# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
+# define XXH_STATIC_LINKING_ONLY
+ /* make all functions private */
+# undef XXH_PUBLIC_API
+# if defined(__GNUC__)
+# define XXH_PUBLIC_API static __inline __attribute__((unused))
+# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define XXH_PUBLIC_API static inline
+# elif defined(_MSC_VER)
+# define XXH_PUBLIC_API static __inline
+# else
+ /* note: this version may generate warnings for unused static functions */
+# define XXH_PUBLIC_API static
+# endif
+
+ /*
+ * This part deals with the special case where a unit wants to inline xxHash,
+ * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
+ * as part of some previously included *.h header file.
+ * Without further action, the new include would just be ignored,
+ * and functions would effectively _not_ be inlined (silent failure).
+ * The following macros solve this situation by prefixing all inlined names,
+ * avoiding naming collision with previous inclusions.
+ */
+# ifdef XXH_NAMESPACE
+# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
+ /*
+ * Note: Alternative: #undef all symbols (it's a pretty large list).
+ * Without #error: it compiles, but functions are actually not inlined.
+ */
+# endif
+# define XXH_NAMESPACE XXH_INLINE_
+ /*
+ * Some identifiers (enums, type names) are not symbols, but they must
+ * still be renamed to avoid redeclaration.
+ * Alternative solution: do not redeclare them.
+ * However, this requires some #ifdefs, and is a more dispersed action.
+ * Meanwhile, renaming can be achieved in a single block
+ */
+# define XXH_IPREF(Id) XXH_INLINE_ ## Id
+# define XXH_OK XXH_IPREF(XXH_OK)
+# define XXH_ERROR XXH_IPREF(XXH_ERROR)
+# define XXH_errorcode XXH_IPREF(XXH_errorcode)
+# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
+# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
+# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+# define XXH32_state_s XXH_IPREF(XXH32_state_s)
+# define XXH32_state_t XXH_IPREF(XXH32_state_t)
+# define XXH64_state_s XXH_IPREF(XXH64_state_s)
+# define XXH64_state_t XXH_IPREF(XXH64_state_t)
+# define XXH3_state_s XXH_IPREF(XXH3_state_s)
+# define XXH3_state_t XXH_IPREF(XXH3_state_t)
+# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+ /* Ensure the header is parsed again, even if it was previously included */
+# undef XXHASH_H_5627135585666179
+# undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+
+
+/* ****************************************************************
+ * Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+# ifdef XXH_EXPORT
+# define XXH_PUBLIC_API __declspec(dllexport)
+# elif XXH_IMPORT
+# define XXH_PUBLIC_API __declspec(dllimport)
+# endif
+# else
+# define XXH_PUBLIC_API /* do nothing */
+# endif
+#endif
+
+/*!
+ * XXH_NAMESPACE, aka Namespace Emulation:
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
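+ *
+ * For example (hypothetical prefix): building with -DXXH_NAMESPACE=MYLIB_
+ * renames the exported symbol XXH32 to MYLIB_XXH32, while calls to XXH32()
+ * in client code still compile unchanged.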
+ */
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 7
+#define XXH_VERSION_RELEASE 4
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+* 32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint32_t XXH32_hash_t;
+#else
+# include <limits.h>
+# if UINT_MAX == 0xFFFFFFFFUL
+ typedef unsigned int XXH32_hash_t;
+# else
+# if ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
+# else
+# error "unsupported platform: need a 32-bit type"
+# endif
+# endif
+#endif
+
+/*!
+ * XXH32():
+ * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+ * The memory between input & input+length must be valid (allocated and read-accessible).
+ * "seed" can be used to alter the result predictably.
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/******* Streaming *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ */
+
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/******* Canonical representation *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint64_t XXH64_hash_t;
+#else
+ /* the following type must have a width of 64-bit */
+ typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*!
+ * XXH64():
+ * Returns the 64-bit hash of sequence of length @length stored at memory
+ * address @input.
+ * @seed can be used to alter the result predictably.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
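+ *
+ * A one-shot sketch, with a seed of 0 (`buf` and `n` stand for a
+ * caller-provided buffer and its length):
+ *   XXH64_hash_t const h = XXH64(buf, n, 0);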
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ *
+ * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
+ * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
+ * explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ *
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The 128-bit version adds additional strength, but it is slightly slower.
+ *
+ * The XXH3 algorithm is still in development.
+ * The results it produces may still change in future versions.
+ *
+ * Results produced by v0.7.x are not comparable with results from v0.7.y.
+ * However, the API is completely stable, and it can safely be used for
+ * ephemeral data (local sessions).
+ *
+ * Avoid storing values in long-term storage until the algorithm is finalized.
+ *
+ * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if
+ * everything remains fine, its current format will be "frozen" and become the
+ * final one.
+ *
+ * After which, return values of XXH3 and XXH128 will no longer change in
+ * future versions.
+ *
+ * XXH3's return values will be officially finalized upon reaching v0.8.0.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+
+#ifdef XXH_NAMESPACE
+# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+
+# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+
+# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#endif
+
+/* XXH3_64bits():
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional
+ * collision.
+ * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid trivial sequences, such as repeating sequences and especially '\0',
+ * as these can cancel themselves out.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/*
+ * XXH3_64bits_withSeed():
+ * This variant generates a custom secret on the fly based on the default
+ * secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
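+ *
+ * Sketch, with an arbitrary non-zero seed (`buf` and `n` stand for a
+ * caller-provided buffer and its length):
+ *   XXH64_hash_t const h = XXH3_64bits_withSeed(buf, n, (XXH64_hash_t)1234);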
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+   /* used to store a custom secret generated from the seed. Makes state larger.
+    * Design might change */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+   XXH32_hash_t bufferedSize;
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t secretLimit;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+   /* note: there is some padding after due to alignment on 64 bytes */
+   const unsigned char* secret;
+};   /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever possible.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with the default parameters.
+ * The result will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * The digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, and must outlive the hash streaming session, so
+ * be careful when using stack arrays.
+ * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
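+ *
+ * A minimal sketch of the lifetime requirement (editor's addition; the
+ * buffer name is illustrative and should be filled with random bytes):
+ *
+ *     static unsigned char kSecret[XXH3_SECRET_SIZE_MIN];  (outlives the session)
+ *     XXH3_64bits_reset_withSecret(statePtr, kSecret, sizeof(kSecret));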
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
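+ *
+ * For example (editor's sketch), it can be passed directly to qsort():
+ *
+ *     qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);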
+ *
+ * return: >0 if *h128_1 > *h128_2
+ *         <0 if *h128_1 < *h128_2
+ *         =0 if *h128_1 == *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be found in xxhash.c.
+ *
+ * However, code inlining requires the implementation to be visible to the
+ * compiler, usually within the header.
+ *
+ * As a workaround, xxhash.c used to be included within xxhash.h. This caused
+ * some issues with some build systems, especially ones which treat .c files
+ * as source files.
+ *
+ * Therefore, the implementation is now directly integrated within xxhash.h.
+ * Another small advantage is that xxhash.c is no longer needed in /include.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!
+ * XXH_FORCE_MEMORY_ACCESS:
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selecting a different access method for improved
+ * performance.
+ * Method 0 (default):
+ *     Use `memcpy()`. Safe and portable.
+ * Method 1:
+ *     `__attribute__((packed))` statement. It depends on compiler extensions
+ *     and is therefore not portable.
+ *     This method is safe if your compiler supports it, and *generally* as
+ *     fast or faster than `memcpy`.
+ * Method 2:
+ *     Direct access via cast. This method doesn't depend on the compiler but
+ *     violates the C standard.
+ *     It can generate buggy code on targets which do not support unaligned
+ *     memory accesses.
+ *     But in some circumstances, it's the only known way to get the most
+ *     performance (i.e. GCC + ARMv6)
+ * Method 3:
+ *     Byteshift. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction.
+ *     See https://stackoverflow.com/a/32095106/646947 for details.
+ *     Prefer these methods in priority order (0 > 1 > 2 > 3)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+ * triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is NULL, the result is the same as for a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!
+ * XXH_FORCE_ALIGN_CHECK:
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means: check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned or when alignment
+ * doesn't matter for performance.
+ *
+ * This option does not affect XXH3.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+#  if defined(__i386)  || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!
+ * XXH_NO_INLINE_HINTS:
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#ifndef XXH_NO_INLINE_HINTS
+#  if defined(__OPTIMIZE_SIZE__)  /* -Os, -Oz */ \
+   || defined(__NO_INLINE__)      /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+/*!
+ * XXH_REROLL:
+ * Whether to reroll XXH32_finalize and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang.
+ */
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*!
+ * Modify the local functions below should you wish to use some other memory
+ * routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free(void* p)  { free(p); }
+
+/*! and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) \
+    || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
+#    endif
+#  else
+#    define XXH_FORCE_INLINE static
+#    define XXH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*
+ * DEBUGLEVEL is expected to be defined externally, typically via the compiler's
+ * command line options. The value must be a number.
+ */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* ***   Memory access   *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/*!
+ * XXH_CPU_LITTLE_ENDIAN:
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
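+        /* Editor's note: both branches produce the little-endian
+         * interpretation of the bytes; e.g. the byte sequence
+         * 0x01 0x02 0x03 0x04 yields 0x04030201 on either endianness,
+         * the swap only being paid on big-endian hosts. */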
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+     * loop (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even on Sandy/Ivy Bridge,
+     *   where pmulld was fastest, it is still not worth going into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp, v  // not required with VEX encoding
+     *      pslld tmp, 13  // tmp <<= 13
+     *      psrld v, 19    // x >>= 19
+     *      por v, tmp     // x |= tmp
+     *   compared to one for scalar:
+     *      roll v, 13     // reliably fast across the board
+     *      shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   SIMD actually serializes this operation: while v1 is rotating, v2
+     *   can load data and v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     * :                // However, as an Input/Output Operand,
+     * "+r"             // constrain a read/write operand (+) as a general purpose register (r).
+     * (acc)            // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize.
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
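+/* Editor's sketch of the streaming API implemented here, assuming input
+ * arriving in two chunks (`part1`/`part2` are illustrative names):
+ *
+ *     XXH32_state_t* st = XXH32_createState();
+ *     XXH32_reset(st, 0);                    // seed 0
+ *     XXH32_update(st, part1, part1Len);
+ *     XXH32_update(st, part2, part2Len);
+ *     XXH32_hash_t h = XXH32_digest(st);     // equals one-shot XXH32()
+ *     XXH32_freeState(st);
+ */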
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
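+        /* Editor's note: the XXH_aligned path is only taken after the
+         * caller has ensured alignment (e.g. the `& 7` test in XXH64()
+         * below), so the direct dereference is safe here. */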
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
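+        /* Editor's note: for a power-of-two divisor, `addr & 7` equals
+         * `addr % 8`, so this selects the aligned fast path; e.g.
+         * 0x1010 & 7 == 0 (aligned), 0x1013 & 7 == 3 (unaligned). */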
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
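+/* Editor's sketch of a canonical round trip (the FILE* `f` is illustrative):
+ *
+ *     XXH64_canonical_t c;
+ *     XXH64_canonicalFromHash(&c, XXH64(buf, len, 0));
+ *     fwrite(c.digest, 1, sizeof(c.digest), f);        // stored big endian
+ *     // ... later, after reading the 8 bytes back into c:
+ *     XXH64_hash_t h = XXH64_hashFromCanonical(&c);    // recovers the value
+ */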
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From e335a8ca7615c702cfa2dcdb71deb69468088fd8 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jun 2020 21:04:25 +0200 Subject: first steps in bringing over the JIT refactor/fastmem --- src/ARM.cpp | 43 +- src/ARM.h | 15 +- src/ARMJIT.cpp | 771 ++++++++++----------------------- src/ARMJIT.h | 64 +-- src/ARMJIT_A64/ARMJIT_ALU.cpp | 123 +++++- src/ARMJIT_A64/ARMJIT_Branch.cpp | 99 ++--- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 383 ++++++++++++----- src/ARMJIT_A64/ARMJIT_Compiler.h | 71 +++- src/ARMJIT_A64/ARMJIT_Linkage.s | 68 +++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 790 ++++++++++++++++------------------ src/ARMJIT_Compiler.h | 12 + src/ARMJIT_Internal.h | 70 +-- src/ARMJIT_Memory.cpp | 822 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_Memory.h | 53 +++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 92 +--- src/ARMJIT_x64/ARMJIT_Compiler.h | 11 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 45 +- src/ARM_InstrInfo.cpp | 73 ++-- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 6 +- src/CP15.cpp | 84 ++-- src/Config.cpp | 6 +- src/Config.h | 1 + src/NDS.cpp | 220 +++++----- src/NDS.h | 17 +- 25 files changed, 2342 insertions(+), 1598 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_Memory.cpp create mode 100644 src/ARMJIT_Memory.h (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 92a3a9e..e529be8 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,6 +21,8 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "ARMJIT.h" +#include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" @@ -74,7 +76,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -82,6 +86,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -622,24 +633,26 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); - if (!translatedAddr) + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - // hack so Cycles <= 0 becomes Cycles < 0 - Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, 
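+            /* Editor's note: each fast-lookup entry is a u64 that packs
+             * (blockAddr | cpu number) into the upper 32 bits and the
+             * compiled entry's offset into the lower 32, as written by
+             * CompileBlock() later in this patch. */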
FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; if (StopExecution) { @@ -766,23 +779,25 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); - if (!translatedAddr) + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index b1e8053..b7f16d6 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -32,11 +32,14 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); @@ -143,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -158,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -260,8 +269,8 @@ public: u32 DTCMBase, DTCMSize; s32 RegionCodeCycles; - u8 ITCM[0x8000]; - u8 DTCM[0x4000]; + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8d87c76..53b28c1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -10,13 +10,8 @@ #include "Config.h" #include "ARMJIT_Internal.h" -#if defined(__x86_64__) -#include "ARMJIT_x64/ARMJIT_Compiler.h" -#elif defined(__aarch64__) -#include "ARMJIT_A64/ARMJIT_Compiler.h" -#else -#error "The current target platform doesn't have a JIT backend" -#endif +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -29,6 +24,11 @@ #include "Wifi.h" #include "NDSCart.h" +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + namespace ARMJIT { @@ -37,281 +37,100 @@ namespace ARMJIT Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = -{ - 0x8000, // Unmapped Region (dummy) - 0x8000, // ITCM - 4*1024*1024, // Main RAM - 0x8000, // SWRAM - 0xA4000, // LCDC - 0x8000, // ARM9 BIOS - 0x4000, // ARM7 BIOS - 0x10000, // ARM7 WRAM - 0x40000 // ARM7 WVRAM -}; - -const u32 ExeMemRegionOffsets[] = -{ - 0, - 0x8000, - 0x10000, - 0x410000, - 0x418000, - 0x4BC000, - 0x4C4000, - 0x4C8000, - 0x4D8000, - 0x518000, -}; - -/* - translates address to pseudo physical address - - more 
compact, eliminates mirroring, everything comes in a row - - we only need one translation table -*/ - -u32 TranslateAddr9(u32 addr) -{ - switch (ClassifyAddress9(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM9: - if (NDS::SWRAM_ARM9) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); - else - return 0; - case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); - case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; - case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); - default: return 0; - } -} - -u32 TranslateAddr7(u32 addr) -{ - switch (ClassifyAddress7(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM7: - if (NDS::SWRAM_ARM7) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); - else - return 0; - case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; - case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); - case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); - default: return 0; - } -} - -AddressRange CodeRanges[ExeMemSpaceSize / 512]; - -TinyVector InvalidLiterals; +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; -u8 MemoryStatus9[0x800000]; -u8 MemoryStatus7[0x800000]; +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; -int ClassifyAddress9(u32 addr) +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { - if (addr < NDS::ARM9->ITCMSize) - return memregion_ITCM; - else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else - { - switch (addr & 0xFF000000) - { - case 0x02000000: - return memregion_MainRAM; - case 0x03000000: - return memregion_SWRAM9; - case 0x04000000: - return memregion_IO9; - case 0x06000000: - return memregion_VRAM; - } - } - return memregion_Other; -} + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, +}; -int ClassifyAddress7(u32 addr) +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = { - if (addr < 0x00004000) - return memregion_BIOS7; - else - { - switch (addr & 0xFF800000) - { - case 0x02000000: - case 0x02800000: - return memregion_MainRAM; - case 0x03000000: - if 
(NDS::SWRAM_ARM7) - return memregion_SWRAM7; - else - return memregion_WRAM7; - case 0x03800000: - return memregion_WRAM7; - case 0x04000000: - return memregion_IO7; - case 0x04800000: - return memregion_Wifi; - case 0x06000000: - case 0x06800000: - return memregion_VWRAM; - } - } - return memregion_Other; -} + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, +}; -void UpdateMemoryStatus9(u32 start, u32 end) +u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) - { - u32 addr = i << 12; - - int region = ClassifyAddress9(addr); - u32 pseudoPhyisical = TranslateAddr9(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus9[i * 8 + j] = val; - } - } -} + NULL, + FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM +}; -void UpdateMemoryStatus7(u32 start, u32 end) +u32 LocaliseCodeAddress(u32 num, u32 addr) { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize) + && CodeMemRegions[region]) { - u32 addr = i << 12; - - int region = ClassifyAddress7(addr); - u32 pseudoPhyisical = TranslateAddr7(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus7[i * 8 + j] = val; - } + addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + addr |= (u32)region << 28; + return addr; } + return 0; } -void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) -{ - for (u32 i = 1; i < exeMem_Count; i++) - { - if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) - { - for (u32 num = 0; num < 2; num++) - { - u32 physSize = ExeMemRegionSizes[i]; - u32 mapSize = 0; - u32 mapStart = 0; - switch (i) - { - case exeMem_ITCM: - if (num == 0) - mapStart = 0; mapSize = NDS::ARM9->ITCMSize; - break; - case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; - case exeMem_SWRAM: - if (num == 0) - { - if (NDS::SWRAM_ARM9) - mapStart = 0x3000000, mapSize = 0x1000000; - else - mapStart = mapSize = 0; - } - else - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3000000, mapSize = 0x800000; - else - mapStart = mapSize = 0; - } - break; - case exeMem_LCDC: - if (num == 0) - mapStart = 0x6800000, mapSize = 0xA4000; - break; - case exeMem_ARM9_BIOS: - if (num == 0) - mapStart = 0xFFFF0000, mapSize = 0x10000; - break; - case exeMem_ARM7_BIOS: - if (num == 1) - mapStart = 0; mapSize = 0x4000; - break; - case exeMem_ARM7_WRAM: - if (num == 1) - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3800000, mapSize = 0x800000; - else - mapStart = 0x3000000, mapSize = 0x1000000; - } - break; - case exeMem_ARM7_WVRAM: - if (num == 1) - mapStart = 0x6000000, mapSize = 0x1000000; - break; - } - - for (u32 j = 0; j < mapSize / physSize; 
j++) - { - u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); - if (num == 0 - && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - continue; - if (invalidate) - { - if (num == 0) - MemoryStatus9[virtAddr / 512] |= 0x80; - else - MemoryStatus7[virtAddr / 512] |= 0x80; - } - else - { - if (num == 0) - MemoryStatus9[virtAddr / 512] &= ~0x80; - else - MemoryStatus7[virtAddr / 512] &= ~0x80; - } - } - - } - return; - } - } - - assert(false); -} +TinyVector InvalidLiterals; template -T SlowRead9(ARMv5* cpu, u32 addr) +T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; addr &= ~(sizeof(T) - 1); @@ -335,13 +154,13 @@ T SlowRead9(ARMv5* cpu, u32 addr) } template -void SlowWrite9(ARMv5* cpu, u32 addr, T val) +void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); if (addr < cpu->ITCMSize) { - InvalidateITCMIfNecessary(addr); + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); *(T*)&cpu->ITCM[addr & 0x7FFF] = val; } else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) @@ -362,13 +181,13 @@ void SlowWrite9(ARMv5* cpu, u32 addr, T val) } } -template void SlowWrite9(ARMv5*, u32, u32); -template void SlowWrite9(ARMv5*, u32, u16); -template void SlowWrite9(ARMv5*, u32, u8); +template void SlowWrite9(u32, ARMv5*, u32); +template void SlowWrite9(u32, ARMv5*, u16); +template void SlowWrite9(u32, ARMv5*, u8); -template u32 SlowRead9(ARMv5*, u32); -template u16 SlowRead9(ARMv5*, u32); -template u8 SlowRead9(ARMv5*, u32); +template u32 SlowRead9(u32, ARMv5*); +template u16 SlowRead9(u32, ARMv5*); +template u8 SlowRead9(u32, ARMv5*); template T SlowRead7(u32 addr) @@ -407,14 +226,15 @@ template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) - SlowWrite9(cpu, addr, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(cpu, addr); - addr += !PreInc * 4; + data[i] = SlowRead9(addr, cpu); + addr += 4; } } @@ -422,14 +242,15 @@ template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) SlowWrite7(addr, data[i]); else data[i] = SlowRead7(addr); - addr += !PreInc * 4; + addr += 4; } } @@ -540,16 +361,18 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp9; -UnreliableHashTable FastBlockLookUp7; void Init() { JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); } void DeInit() { + ARMJIT_Memory::DeInit(); + delete JITCompiler; } @@ -557,8 +380,7 @@ void Reset() { ResetBlockCache(); - UpdateMemoryStatus9(0, 0xFFFFFFFF); - UpdateMemoryStatus7(0, 0xFFFFFFFF); + ARMJIT_Memory::Reset(); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -673,11 +495,12 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) // it basically checks if one iteration of a loop depends on another // the rules are quite simple + JIT_DEBUGPRINT("checking potential idle loop\n"); u16 regsWrittenTo = 0; u16 regsDisallowedToWrite = 0; for (int i = 0; i < instrsCount; i++) { - //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); if 
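
// The SlowBlockTransfer rewrite above trades the per-iteration
// "addr += PreInc * 4; ...; addr += !PreInc * 4;" dance for a single
// conditional bump before the loop plus an unconditional step inside it.
// A quick equivalence check (sketch, not part of the patch):
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

int main()
{
    for (int pre = 0; pre <= 1; pre++)
    {
        u32 a = 0x02000000, b = 0x02000000;
        if (pre) b += 4;        // new form: one bump up front
        for (int i = 0; i < 4; i++)
        {
            a += pre * 4;       // old form: bump before the access...
            assert(a == b);     // ...both forms access the same address
            a += !pre * 4;      // ...or bump after it
            b += 4;             // new form: unconditional step
        }
    }
}
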
(instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) return false; if (i < instrsCount - 1 && instrs[i].Info.Branches()) @@ -782,8 +605,6 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F - -extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -794,14 +615,28 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr9(blockAddr) - : TranslateAddr7(blockAddr); - if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) - { - printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); - } - + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 localAddr = existingBlockIt->second->StartAddrLocal; + + u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; @@ -842,9 +677,8 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(instrs[i].Addr) - : TranslateAddr7(instrs[i].Addr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -928,9 +762,11 @@ void CompileBlock(ARM* cpu) && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral && DecodeLiteral(thumb, instrs[i], literalAddr)) { - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(literalAddr) - : TranslateAddr7(literalAddr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } u32 translatedAddrRounded = translatedAddr & ~0x1FF; u32 j = 0; @@ -994,9 +830,7 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetPseudoPhysical = cpu->Num == 0 - ? 
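
// Sketch of the 64-bit fast-map entry written above (my reading of the code,
// not an excerpt): the high word tags the entry with the block's start
// address OR'd with the CPU number (bit 0 of a code address is always clear,
// so the tag is unambiguous), and the low word holds the entry point's
// offset inside the JIT code region.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;
typedef uint64_t u64;

u64 MakeEntry(u32 blockAddr, u32 num, u32 entryOffset)
{
    return (((u64)blockAddr | num) << 32) | entryOffset;
}

bool Matches(u64 entry, u32 addr, u32 num)
{
    return (entry >> 32) == ((u64)addr | num);
}

int main()
{
    u64 e = MakeEntry(0x02004F00, 1, 0x1580);
    assert(Matches(e, 0x02004F00, 1));   // same address, ARM7: hit
    assert(!Matches(e, 0x02004F00, 0));  // same address, ARM9: miss
    assert((u32)e == 0x1580);            // low word recovers the offset
}
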
TranslateAddr9(target) - : TranslateAddr7(target); + u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); if (link) { @@ -1048,7 +882,7 @@ void CompileBlock(ARM* cpu) { RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1087,11 +921,12 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numLiterals; j++) block->Literals()[j] = literalLoadAddrs[j]; - block->PseudoPhysicalAddr = pseudoPhysicalAddr; + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); } else { @@ -1104,30 +939,34 @@ void CompileBlock(ARM* cpu) assert(addressRanges[j] == block->AddressRanges()[j]); assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; - CodeRanges[addressRanges[j] / 512].Blocks.Add(block); - UpdateRegionByPseudoPhyiscal(addressRanges[j], true); + AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); } if (cpu->Num == 0) - { - JitBlocks9[pseudoPhysicalAddr] = block; - FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks9[blockAddr] = block; else - { - JitBlocks7[pseudoPhysicalAddr] = block; - FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 localAddr) { - JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + AddressRange* region = CodeMemRegions[localAddr >> 28]; + AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; for (int i = 0; i < range->Blocks.Length;) @@ -1138,7 +977,7 @@ void InvalidateByAddr(u32 pseudoPhysical) u32 mask = 0; for (int j = 0; j < block->NumAddresses; j++) { - if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) { mask = block->AddressMasks()[j]; invalidated = block->AddressMasks()[j] & mask; @@ -1154,15 +993,21 @@ void InvalidateByAddr(u32 pseudoPhysical) } range->Blocks.Remove(i); + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + } + bool literalInvalidation = false; for (int j = 0; j < block->NumLiterals; j++) { u32 addr = block->Literals()[j]; - if (addr == pseudoPhysical) + if (addr == 
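
// The bookkeeping granularities used throughout the registration and
// invalidation code above, spelled out (sketch): code presence is tracked in
// 512-byte AddressRange buckets with one mask bit per 16 bytes, and memory
// protection toggles per 4 KB page, i.e. per group of eight buckets.
#include <cstdio>
#include <cstdint>

typedef uint32_t u32;

int main()
{
    u32 localAddr = 0x40001234;  // region 4, offset 0x1234 (example value)
    u32 offset    = localAddr & 0xFFFFFFF;

    u32 bucket = offset / 512;               // index into the AddressRange array
    u32 bit    = (offset & 0x1FF) / 16;      // which 16-byte slice of the bucket
    u32 page   = (offset & 0xFFFF000) / 512; // first bucket of the 4 KB page

    // offset 0x1234 -> bucket 9, mask bit 3, page starts at bucket 8
    printf("bucket %u, mask bit %u, page starts at bucket %u\n",
           (unsigned)bucket, (unsigned)bit, (unsigned)page);
}
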
localAddr) { - if (InvalidLiterals.Find(pseudoPhysical) != -1) + if (InvalidLiterals.Find(localAddr) != -1) { - InvalidLiterals.Add(pseudoPhysical); + InvalidLiterals.Add(localAddr); JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); } literalInvalidation = true; @@ -1172,35 +1017,30 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 512) != (pseudoPhysical / 512)) + if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 28]; + AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; assert(otherRange != range); + bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); if (otherRange->Blocks.Length == 0) { + if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + otherRange->Code = 0; - UpdateRegionByPseudoPhyiscal(addr, false); } } } - for (int j = 0; j < block->NumLinks(); j++) - JITCompiler->UnlinkBlock(block->Links()[j]); - block->ResetLinks(); - + FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) - { - JitBlocks9.erase(block->PseudoPhysicalAddr); - FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); - } + JitBlocks9.erase(block->StartAddr); else - { - JitBlocks7.erase(block->PseudoPhysicalAddr); - FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); - } + JitBlocks7.erase(block->StartAddr); if (!literalInvalidation) { @@ -1213,24 +1053,66 @@ void InvalidateByAddr(u32 pseudoPhysical) delete block; } } +} - if (range->Blocks.Length == 0) - UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); +template +void CheckAndInvalidate(u32 addr) +{ + // let's hope this gets all properly inlined + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + { + u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr | (region << 28)); + } +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; } -void InvalidateRegionIfNecessary(u32 pseudoPhyisical) +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { - if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) - InvalidateByAddr(pseudoPhyisical); + int region = num == 0 + ? 
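
// The CheckAndInvalidate definition above is templated on the CPU number and
// the memory region (compare the explicit instantiations further down, e.g.
// CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>), so that
// GetRegionMapping can fold to constants per instantiation. A minimal model
// of the fast path, with the bookkeeping reduced to what the check needs:
#include <cstdint>

typedef uint32_t u32;

struct AddressRange { u32 Code; };  // reduced stand-in for the real struct

template <u32 num, int region>
void CheckAndInvalidateModel(AddressRange* buckets, u32 localAddr)
{
    // only take the expensive invalidation path when the 16-byte slice
    // being written actually holds compiled code
    u32 bit = 1u << ((localAddr & 0x1FF) / 16);
    if (buckets[localAddr / 512].Code & bit)
    {
        // InvalidateByAddr(localAddr | (region << 28));
    }
}

int main()
{
    AddressRange buckets[8] = {};
    CheckAndInvalidateModel<0, 4>(buckets, 0x123); // no code there: no-op
}
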
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (CodeMemRegions[region] + && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize)) + { + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + // evil, though it should work for everything except DTCM which is not relevant here + start = blockAddr & ~(memorySize - 1); + size = memorySize; + return true; + } + else + return false; } +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); + void ResetBlockCache() { printf("Resetting JIT block cache...\n"); InvalidLiterals.Clear(); - FastBlockLookUp9.Reset(); - FastBlockLookUp7.Reset(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -1251,8 +1133,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } delete block; } @@ -1262,8 +1145,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } } JitBlocks9.clear(); @@ -1272,191 +1156,4 @@ void ResetBlockCache() JITCompiler->Reset(); } -template -JitBlockEntry LookUpBlockEntry(u32 addr) -{ - auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; - u32 entryOffset = fastMap.LookUp(addr); - if (entryOffset != UINT32_MAX) - return JITCompiler->AddEntryOffset(entryOffset); - - auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; - auto block = slowMap.find(addr); - if (block != slowMap.end()) - { - fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); - return block->second->EntryPoint; - } - return NULL; -} - -template JitBlockEntry LookUpBlockEntry<0>(u32); -template JitBlockEntry LookUpBlockEntry<1>(u32); - -template -void LinkBlock(ARM* cpu, u32 codeOffset) -{ - auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; - u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); - u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); - auto block = blockMap.find(targetPseudoPhys); - if (block == blockMap.end()) - { - CompileBlock(cpu); - block = blockMap.find(targetPseudoPhys); - } - - JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); - - block->second->AddLink(codeOffset); - JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); -} - -template void LinkBlock<0>(ARM*, u32); -template void LinkBlock<1>(ARM*, u32); - -void WifiWrite32(u32 addr, u32 val) -{ - Wifi::Write(addr, val & 0xFFFF); - Wifi::Write(addr + 2, val >> 16); -} - -u32 WifiRead32(u32 addr) -{ - return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); -} - -template -void VRAMWrite(u32 addr, T val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} -template -T VRAMRead(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} - -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) -{ - if (cpu->Num == 0) - { - switch (addr & 0xFF000000) - { - case 0x04000000: - if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) - return (void*)NDSCart::ReadROMData; - - /* - unfortunately we can't map GPU2D this way - since it's hidden inside an object - - though GPU3D registers are accessed much more intensive - */ - if (addr >= 0x04000320 && addr < 0x040006A4) - { - switch (size | store) - { - case 8: return (void*)GPU3D::Read8; - case 9: return (void*)GPU3D::Write8; - case 16: return (void*)GPU3D::Read16; - case 17: return (void*)GPU3D::Write16; - case 32: return (void*)GPU3D::Read32; - case 33: return (void*)GPU3D::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; - } - break; - case 0x06000000: - switch (size | store) - { - case 8: return (void*)VRAMRead; - case 9: return NULL; - case 16: return (void*)VRAMRead; - case 17: return (void*)VRAMWrite; - case 32: return (void*)VRAMRead; - case 33: return (void*)VRAMWrite; - } - break; - } - } - else - { - switch (addr & 0xFF800000) - { - case 0x04000000: - if (addr >= 0x04000400 && addr < 0x04000520) - { - switch (size | store) - { - case 8: return (void*)SPU::Read8; - case 9: return (void*)SPU::Write8; - case 16: return (void*)SPU::Read16; - case 17: return (void*)SPU::Write16; - case 32: return (void*)SPU::Read32; - case 33: return (void*)SPU::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; - } - break; - case 0x04800000: - if (addr < 0x04810000 && size >= 16) - { - switch (size | store) - { - case 16: return (void*)Wifi::Read; - case 17: return (void*)Wifi::Write; - case 32: return 
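
// The GetFuncForAddr dispatch above is keyed on (size | store): the sizes
// 8/16/32 all have bit 0 clear, so OR'ing in the store flag packs both into
// one switch value, with 8/16/32 selecting the read handlers and 9/17/33 the
// matching writes. Sketch of the key's round trip:
#include <cassert>

int main()
{
    const int sizes[] = {8, 16, 32};
    for (int size : sizes)
    {
        for (int store = 0; store <= 1; store++)
        {
            int key = size | store;
            assert((key & ~1) == size);  // size is recoverable from the key
            assert((key & 1) == store);  // and so is the access direction
        }
    }
}
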
(void*)WifiRead32; - case 33: return (void*)WifiWrite32; - } - } - break; - case 0x06000000: - case 0x06800000: - switch (size | store) - { - case 8: return (void*)GPU::ReadVRAM_ARM7; - case 9: return (void*)GPU::WriteVRAM_ARM7; - case 16: return (void*)GPU::ReadVRAM_ARM7; - case 17: return (void*)GPU::WriteVRAM_ARM7; - case 32: return (void*)GPU::ReadVRAM_ARM7; - case 33: return (void*)GPU::WriteVRAM_ARM7; - } - } - } - return NULL; -} - } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 44a6140..2320b7b 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,32 +9,7 @@ namespace ARMJIT { -enum ExeMemKind -{ - exeMem_Unmapped = 0, - exeMem_ITCM, - exeMem_MainRAM, - exeMem_SWRAM, - exeMem_LCDC, - exeMem_ARM9_BIOS, - exeMem_ARM7_BIOS, - exeMem_ARM7_WRAM, - exeMem_ARM7_WVRAM, - exeMem_Count -}; - -extern const u32 ExeMemRegionOffsets[]; -extern const u32 ExeMemRegionSizes[]; - -typedef u32 (*JitBlockEntry)(); - -const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... - -u32 TranslateAddr9(u32 addr); -u32 TranslateAddr7(u32 addr); - -template -JitBlockEntry LookUpBlockEntry(u32 addr); +typedef void (*JitBlockEntry)(); void Init(); void DeInit(); @@ -43,44 +18,15 @@ void Reset(); void InvalidateByAddr(u32 pseudoPhysical); -void InvalidateRegionIfNecessary(u32 addr); - -inline void InvalidateMainRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); -} -inline void InvalidateITCMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); -} -inline void InvalidateLCDCIfNecessary(u32 addr) -{ - if (addr < 0x68A3FFF) - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); -} -inline void InvalidateSWRAM7IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); -} -inline void InvalidateSWRAM9IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); -} -inline void InvalidateARM7WRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); -} -inline void InvalidateARM7WVRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); -} +template +void CheckAndInvalidate(u32 addr); void CompileBlock(ARM* cpu); void ResetBlockCache(); -void UpdateMemoryStatus9(u32 start, u32 end); -void UpdateMemoryStatus7(u32 start, u32 end); +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size); } diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp index 0fe6a97..5f021a0 100644 --- a/src/ARMJIT_A64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -243,7 +243,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S && !CurInstr.SetFlags) S = false; - bool CVInGP = false; + bool CVInGPR = false; switch (op) { case 0x2: // SUB @@ -306,7 +306,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 UBFX(W2, RCPSR, 29, 1); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, rn, W2); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -335,7 +335,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 ORN(W1, WZR, op2.Reg.Rm, 
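
// Plain-C++ model of the CVInGPR path renamed above (a sketch, not an
// excerpt): when a multi-instruction lowering such as ADC clobbers the host
// flags before the end, carry and overflow are captured into GPRs with CSET
// right after the ADDS, then inserted into CPSR bits 29 (C) and 28 (V) by
// the BFI pair at the end.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;
typedef uint64_t u64;

u32 UpdateCV(u32 cpsr, bool c, bool v)
{
    cpsr &= ~((1u << 29) | (1u << 28));
    cpsr |= (u32)c << 29;   // BFI(RCPSR, W2, 29, 1)
    cpsr |= (u32)v << 28;   // BFI(RCPSR, W3, 28, 1)
    return cpsr;
}

int main()
{
    // ADC with a carry out but no signed overflow: 0xFFFFFFFF + 0 + 1
    u32 a = 0xFFFFFFFF, b = 0, carryIn = 1;
    u32 res = a + b + carryIn;
    bool c = (((u64)a + b + carryIn) >> 32) != 0;    // CSET(W2, CC_CS)
    bool v = (((~(a ^ b)) & (a ^ res)) >> 31) != 0;  // CSET(W3, CC_VS)
    assert(UpdateCV(0, c, v) == (1u << 29));
}
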
op2.ToArithOption()); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -355,7 +355,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 MVN(W1, rn); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -379,12 +379,12 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S) { - if (CVInGP) + if (CVInGPR) { BFI(RCPSR, W2, 29, 1); BFI(RCPSR, W3, 28, 1); } - Comp_RetriveFlags(!CVInGP); + Comp_RetriveFlags(!CVInGPR); } } @@ -501,7 +501,23 @@ void Compiler::A_Comp_ALUMovOp() MOVI2R(rd, op2.Imm); } else - MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } } if (S) @@ -558,10 +574,7 @@ void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + CLS(W0, rs); Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -594,10 +607,10 @@ void Compiler::A_Comp_Mul_Long() } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -628,6 +641,86 @@ void Compiler::A_Comp_Mul_Long() Comp_RetriveFlags(false); } +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? 
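
// Reference model for the SMULWy/SMLAWy lowering here (SMULL then ASR #16):
// the architectural result is the signed 32x16 product shifted right 16
// bits; the SMLAWy form (!x) then accumulates rn and sets the Q bit (CPSR
// bit 27) on signed overflow, which is what the ADDS/CSEL pair implements.
// Sketch of the multiply itself:
#include <cassert>
#include <cstdint>

typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
typedef uint32_t u32;

u32 SMULWy(u32 rm, u32 rs, bool y)
{
    s16 half = (s16)(y ? (rs >> 16) : (rs & 0xFFFF)); // the SBFX half select
    s64 prod = (s64)(s32)rm * half;                   // SMULL
    return (u32)(prod >> 16);                         // ASR #16
}

int main()
{
    assert(SMULWy(0x00010000, 0x0002, false) == 2);    // low half of rs
    assert(SMULWy(0x00010000, 0x00030000, true) == 3); // y picks the high half
}
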
EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + void Compiler::A_Comp_Mul() { ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index 542f0b7..f130938 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -143,7 +143,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } @@ -152,23 +152,19 @@ void* Compiler::Gen_JumpTo9(int kind) AlignCode16(); void* res = GetRXPtr(); - MOVI2R(W2, kCodeCacheTiming); - // W1 - code cycles non branch - // W2 - branch code cycles LSR(W1, W0, 12); - LSL(W1, W1, 2); ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); LDRB(W1, RCPU, W1); - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, ITCMSize)); STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); - CMP(W0, W3); - FixupBranch outsideITCM = B(CC_LO); - MOVI2R(W1, 1); - MOVI2R(W2, 1); - SetJumpTarget(outsideITCM); + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); FixupBranch switchToThumb; if (kind == 0) @@ -176,40 +172,36 @@ void* Compiler::Gen_JumpTo9(int kind) if (kind == 0 || kind == 1) { - ANDI2R(W0, W0, ~3); - + // ARM if (kind == 0) ANDI2R(RCPSR, RCPSR, ~0x20); - ADD(W3, W0, 4); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); RET(); } + if (kind == 0 || kind == 2) { + // Thumb if (kind == 0) { SetJumpTarget(switchToThumb); - ORRI2R(RCPSR, RCPSR, 0x20); } ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); - ADD(W3, W0, 2); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - FixupBranch halfwordLoc = TBZ(W0, 1); - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); - RET(); - - SetJumpTarget(halfwordLoc); - ADD(RCycles, RCycles, W2); + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); RET(); } @@ -237,7 +229,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 0, 8); UBFX(W3, W3, 8, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~3); @@ -261,7 +253,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 16, 8); UBFX(W3, W3, 24, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~1); @@ -287,22 +279,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto } else { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); - bool previouslyDirty = CPSRDirty; + + bool cpsrDirty = CPSRDirty; SaveCPSR(); - - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
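
// How I read the reworked Gen_JumpTo9 timing sequence above: the per-page
// timing byte is looked up at MemTimings[addr >> 12], the sentinel 0xFF is
// swapped for kCodeCacheTiming by the CMP/CSEL, and the CMP/CSINC against
// WZR forces a flat 1-cycle fetch inside ITCM. The table layout and the
// kCodeCacheTiming value here are assumptions for illustration.
#include <cassert>
#include <cstdint>

typedef uint8_t u8;
typedef uint32_t u32;

const u32 kCodeCacheTiming = 3; // illustrative value

u32 CodeCycles(const u8* memTimings, u32 addr, u32 itcmSize)
{
    u32 t = memTimings[addr >> 12];       // LDRB from MemTimings
    if (t == 0xFF) t = kCodeCacheTiming;  // CMP W1, 0xFF / CSEL
    if (addr < itcmSize) t = 1;           // CMP W0, W2 / CSINC(W1, W1, WZR)
    return t;
}

int main()
{
    u8 timings[16] = {};
    timings[9] = 4;
    timings[10] = 0xFF;
    assert(CodeCycles(timings, 0x0000, 0x8000) == 1);  // ITCM wins
    assert(CodeCycles(timings, 0x9000, 0x8000) == 4);  // plain timing byte
    assert(CodeCycles(timings, 0xA000, 0x8000) == 3);  // cached code region
}
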
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + SaveCycles(); + PushRegs(restoreCPSR); if (switchThumb) MOV(W1, addr); @@ -319,16 +300,12 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto QuickCallFunction(X3, jumpToTrampoline); else QuickCallFunction(X3, jumpToTrampoline); - - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + PopRegs(restoreCPSR); + LoadCycles(); + LoadCPSR(); + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } } @@ -368,21 +345,13 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); FixupBranch skipFailed = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipFailed); } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index a67f357..42435ed 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -1,9 +1,3 @@ -#include "ARMJIT_Compiler.h" - -#include "../ARMInterpreter.h" - -#include "../ARMJIT_Internal.h" - #ifdef __SWITCH__ #include "../switch/compat_switch.h" @@ -13,10 +7,17 @@ extern char __start__; #include #endif +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + #include using namespace Arm64Gen; +extern "C" void ARM_Ret(); namespace ARMJIT { @@ -28,7 +29,10 @@ namespace ARMJIT like x64. At one hand you can translate a lot of instructions directly. But at the same time, there are a ton of exceptions, like for example ADD and SUB can't have a RORed second operand on ARMv8. - */ + + While writing a JIT when an instruction is recompiled into multiple ones + not to write back until you've read all the other operands! +*/ template <> const ARM64Reg RegisterCache::NativeRegAllocOrder[] = @@ -46,6 +50,132 @@ void Compiler::MovePC() ADD(MapReg(15), MapReg(15), Thumb ? 
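
// Background for the "no RORed second operand" remark above: AArch64's
// shifted-register ADD/SUB only encode LSL/LSR/ASR, so an ARM operand like
// "rm, ROR #n" must be rotated into a scratch register first (exactly what
// the ST_ROR special case in Comp_MemAccess does further down). Sketch:
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

u32 Ror32(u32 x, unsigned n)
{
    n &= 31;
    return n ? (x >> n) | (x << (32 - n)) : x;
}

int main()
{
    // ADD rd, rn, rm, ROR #8  ==>  ROR w0, rm, #8 ; ADD rd, rn, w0
    u32 rn = 100, rm = 0x000000FF;
    u32 scratch = Ror32(rm, 8);               // 0xFF000000
    assert(rn + scratch == 100 + 0xFF000000);
}
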
2 : 4); } +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); + + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + Compiler::Compiler() { #ifdef __SWITCH__ @@ -80,8 +210,7 @@ Compiler::Compiler() assert(succeded); SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); - JitMemUseableSize = JitMemSize; - Reset(); + JitMemMainSize = JitMemSize; #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); @@ -90,31 +219,8 @@ Compiler::Compiler() SetCodeBase(pageAligned, pageAligned); JitMemUseableSize = alignedSize; - Reset(); #endif - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - { - MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); - } - } - MemFunc7[0][0] = (void*)NDS::ARM7Read8; - MemFunc7[1][0] = (void*)NDS::ARM7Read16; - MemFunc7[2][0] = (void*)NDS::ARM7Read32; - MemFunc7[0][1] = (void*)NDS::ARM7Write8; - MemFunc7[1][1] = (void*)NDS::ARM7Write16; - MemFunc7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 2; j++) - { - MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); - MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); - } - } + SetCodePtr(0); for (int i = 0; i < 3; i++) { @@ -123,26 +229,26 @@ Compiler::Compiler() } /* - W0 - mode + W5 - mode W1 - reg num W3 - in/out value of reg */ { ReadBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, 
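
// Model of the MSR mask narrowing emitted above (a sketch; the real code
// also handles banked SPSRs and the mode-switch call): in user mode (0x10)
// the control byte is privileged, so the requested field mask is intersected
// with 0xFFFFFF00 by the CMP/CSEL pair before the write.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

u32 MsrWrite(u32 psr, u32 val, u32 fieldMask, bool userMode)
{
    u32 mask = userMode ? (fieldMask & 0xFFFFFF00) : fieldMask;
    return (psr & ~mask) | (val & mask);
}

int main()
{
    u32 cpsr = 0x00000010;  // user mode
    // one MSR trying to set the flags and switch mode at the same time
    u32 out = MsrWrite(cpsr, 0xF000001F, 0xFF0000FF, true);
    assert((out >> 28) == 0xF);    // the flag writes take effect
    assert((out & 0x1F) == 0x10);  // the mode bits are silently ignored
}
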
ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); RET(); @@ -166,19 +272,19 @@ Compiler::Compiler() { WriteBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); MOVI2R(W4, 0); @@ -206,9 +312,71 @@ Compiler::Compiler() RET(); } - //FlushIcache(); + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 33: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 17: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + case 9: QuickCallFunction(X3, SlowWrite7); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 33: QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead9); break; + case 17: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead9); break; + case 9: QuickCallFunction(X3, SlowRead7); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; - JitMemUseableSize -= GetCodeOffset(); SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); } @@ -227,6 +395,16 @@ Compiler::~Compiler() #endif } +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + void Compiler::LoadReg(int reg, ARM64Reg nativeReg) { if (reg == 15) @@ -325,7 +503,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), // Mul - F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, 
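
// The PatchedStoreFuncs/PatchedLoadFuncs switches a little above read
// identically per case because the size-specialising template arguments did
// not survive into this copy of the patch; going by the SlowWrite9/SlowWrite7
// definitions and explicit instantiations earlier in the diff, the intended
// dispatch is almost certainly (my reconstruction):
//
//     case 32: QuickCallFunction(X3, SlowWrite9<u32>); break;
//     case 33: QuickCallFunction(X3, SlowWrite7<u32>); break;
//     case 16: QuickCallFunction(X3, SlowWrite9<u16>); break;
//     case 17: QuickCallFunction(X3, SlowWrite7<u16>); break;
//     case 8:  QuickCallFunction(X3, SlowWrite9<u8>);  break;
//     case 9:  QuickCallFunction(X3, SlowWrite7<u8>);  break;
//
// and likewise SlowRead9<u32/u16/u8> / SlowRead7<u32/u16/u8> in the load
// variant; the bare "template" headers on those helpers would carry the
// matching "template <typename T>" parameter lists.
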
NULL, NULL, NULL, NULL, + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), // ARMv5 exclusives F(Clz), NULL, NULL, NULL, NULL, @@ -356,7 +534,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL, &Compiler::Nop }; #undef F @@ -404,29 +582,34 @@ bool Compiler::CanCompile(bool thumb, u16 kind) return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; } -void Compiler::Comp_BranchSpecialBehaviour() +void Compiler::Comp_BranchSpecialBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) + if (taken && CurInstr.BranchFlags & branch_IdleBranch) { MOVI2R(W0, 1); STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); } - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { - SaveCPSR(false); RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); } } JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) { - printf("JIT memory full, resetting...\n"); + printf("JIT far memory full, resetting...\n"); ResetBlockCache(); } @@ -437,21 +620,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] CurCPU = cpu; ConstantCycles = 0; RegCache = RegisterCache(this, instrs, instrsCount, true); - - //printf("compiling block at %x\n", R15 - (Thumb ? 
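
// Sketch of the near/far code-memory split behind the two "full" checks
// above (my reading of JitMemMainSize/JitMemSecondarySize and the
// SwapCodeRegion helper defined in the header diff further down): block
// bodies are emitted into the main region while OtherCodeRegion remembers
// the cursor of the region not in use, and swapping just exchanges the two.
#include <cassert>

struct Emitter
{
    long codePtr = 0;        // current emission offset (GetCodeOffset())
    long otherCodeRegion;    // saved cursor of the region not in use

    void SwapCodeRegion()
    {
        long tmp = codePtr;
        codePtr = otherCodeRegion;
        otherCodeRegion = tmp;
    }
};

int main()
{
    Emitter e;
    e.otherCodeRegion = 1024;  // far region begins at JitMemMainSize (example)
    e.codePtr = 100;           // somewhere inside the main region

    e.SwapCodeRegion();        // emit out-of-line code in the far region
    assert(e.codePtr == 1024 && e.otherCodeRegion == 100);

    e.SwapCodeRegion();        // back to the interrupted block body
    assert(e.codePtr == 100 && e.otherCodeRegion == 1024);
}
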
2 : 4)); - const u32 ALL_CALLEE_SAVED = 0x7FF80000; - - SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); - - //if (Num == 1) - { - ABI_PushRegisters(SavedRegs); - - MOVP2R(RCPU, CurCPU); - MOVI2R(RCycles, 0); - - LoadCPSR(); - } + CPSRDirty = false; for (int i = 0; i < instrsCount; i++) { @@ -486,6 +655,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (comp == NULL) { + SaveCycles(); SaveCPSR(); RegCache.Flush(); } @@ -535,25 +705,18 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] (this->*comp)(); } - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); if (cond < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipNop = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipNop); } @@ -565,76 +728,74 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } if (comp == NULL) + { + LoadCycles(); LoadCPSR(); + } } RegCache.Flush(); - //if (Num == 1) - { - SaveCPSR(); - - ADD(W0, RCycles, ConstantCycles); - - ABI_PopRegisters(SavedRegs); - } - //else - // ADD(RCycles, RCycles, ConstantCycles); - - RET(); + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); FlushIcache(); - //printf("finished\n"); - return res; } void Compiler::Reset() { + LoadStorePatches.clear(); + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; const u32 brk_0 = 0xD4200000; - for (int i = 0; i < JitMemUseableSize / 4; i++) + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) *(((u32*)GetRWPtr()) + i) = brk_0; } -void Compiler::Comp_AddCycles_C(bool nonConst) +void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (!nonConst && !CurInstr.Info.Branches()) + if (forceNonConstant) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 numI) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; - if (Thumb || CurInstr.Cond() >= 0xE) + if (Thumb || CurInstr.Cond() == 0xE) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
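
// Why Reset() fills the buffer with 0xD4200000: that is AArch64 BRK #0
// (encoding 0xD4200000 | imm16 << 5), so a stray jump into stale JIT memory
// traps immediately instead of running leftover code. Sketch:
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

u32 EncodeBrk(uint16_t imm16)
{
    return 0xD4200000u | ((u32)imm16 << 5);
}

int main()
{
    assert(EncodeBrk(0) == 0xD4200000u);  // the brk_0 fill value above
}
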
0 : CurInstr.CodeCycles)) + c; - ADD(RCycles, RCycles, numI, shift); + SUB(RCycles, RCycles, cycles); if (Thumb || CurInstr.Cond() >= 0xE) - ConstantCycles += c; + ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CDI() @@ -671,7 +832,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } @@ -715,7 +876,7 @@ void Compiler::Comp_AddCycles_CD() } if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 5c9ef41..e4ffc63 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -9,6 +9,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include + namespace ARMJIT { @@ -64,7 +66,14 @@ struct Op2 }; }; -class Compiler : Arm64Gen::ARM64XEmitter +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter { public: typedef void (Compiler::*CompileFunc)(); @@ -72,6 +81,9 @@ public: Compiler(); ~Compiler(); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Arm64Gen::ARM64Reg MapReg(int reg) { assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); @@ -89,7 +101,7 @@ public: void Reset(); - void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 numI); void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); void Comp_AddCycles_CD(); @@ -103,6 +115,9 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void LoadCycles(); + void SaveCycles(); + void Nop() {} void A_Comp_ALUTriOp(); @@ -111,6 +126,7 @@ public: void A_Comp_Mul(); void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); void A_Comp_Clz(); @@ -122,6 +138,8 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -168,7 +186,7 @@ public: void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); - void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); enum { memop_Writeback = 1 << 0, @@ -179,16 +197,33 @@ public: }; void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); - void* Gen_MemoryRoutine9(int size, bool store); - - void* Gen_MemoryRoutine9Seq(bool store, bool preinc); - void* Gen_MemoryRoutine7Seq(bool store, bool preinc); - // 0 = switch mode, 1 = stay arm, 2 = stay thumb void* Gen_JumpTo9(int kind); void* Gen_JumpTo7(int kind); - void Comp_BranchSpecialBehaviour(); + void Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; bool Exit; @@ -202,22 +237,20 @@ public: BitSet32 
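
// The cycle helpers above show the counter now running downwards: fixed
// per-instruction costs accumulate into ConstantCycles at compile time and
// are subtracted from RCycles once at block exit (the SUB before the
// QuickTailCall to ARM_Ret), while only conditional or data-dependent costs
// get an inline SUB. A toy version with made-up costs:
#include <cassert>
#include <cstdint>

typedef int32_t s32;

int main()
{
    const s32 instrCost[] = {1, 1, 3, 1};  // unconditional code-fetch costs
    s32 constantCycles = 0;
    for (s32 c : instrCost)
        constantCycles += c;      // folded while compiling the block

    s32 rcycles = 100;
    rcycles -= 2;                 // inline SUB for one irregular access
    rcycles -= constantCycles;    // single SUB emitted before ARM_Ret

    assert(rcycles == 100 - 2 - 6);
}
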
SavedRegs; - u32 JitMemUseableSize; + u32 JitMemSecondarySize; + u32 JitMemMainSize; void* ReadBanked, *WriteBanked; - // [size][store] - void* MemFunc9[3][2]; - void* MemFunc7[3][2]; - - // [store][pre increment] - void* MemFuncsSeq9[2][2]; - // "[code in main ram] - void* MemFuncsSeq7[2][2]; - void* JumpToFuncs9[3]; void* JumpToFuncs7[3]; + std::unordered_map LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + RegisterCache RegCache; bool CPSRDirty = false; diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18 \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6cf710b..b307d0e 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -2,286 +2,62 @@ #include "../Config.h" +#include "../ARMJIT_Memory.h" + using namespace Arm64Gen; namespace ARMJIT { -// W0 - address -// (if store) W1 - value to store -// W2 - code cycles -void* Compiler::Gen_MemoryRoutine9(int size, bool store) +bool Compiler::IsJITFault(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - u32 addressMask; - switch (size) - { - case 32: addressMask = ~3; break; - case 16: addressMask = ~1; break; - case 8: addressMask = ~0; break; - } - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W3, W0, W3); - CMP(W3, W4); - FixupBranch insideDTCM = B(CC_LO); - - UBFX(W4, W0, 24, 8); - CMP(W4, 0x02); - FixupBranch outsideMainRAM = B(CC_NEQ); - ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); - MOVP2R(X4, NDS::MainRAM); - if (!store && size == 32) - { - LDR(W3, X3, X4); - ANDI2R(W0, W0, 3); - LSL(W0, W0, 3); - RORV(W0, W3, W0); - } - else if (store) - STRGeneric(size, W1, X3, X4); - else - LDRGeneric(size, false, W0, X3, X4); - RET(); - - SetJumpTarget(outsideMainRAM); - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W3); - FixupBranch insideITCM = B(CC_LO); - - if (store) - { - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: 
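
// C rendering of the ARM_Dispatch/ARM_Ret contract in ARMJIT_Linkage.s above
// (a sketch; the pinned registers become globals here): the CPU pointer,
// cycle counter and CPSR live in callee-saved X29/W28/W27 for the whole JIT
// excursion, and every block ends by tail-calling ARM_Ret, which writes the
// two cached values back.
#include <cstdint>

typedef uint32_t u32;
typedef int32_t s32;

struct ARM { s32 Cycles; u32 CPSR; };
typedef void (*JitBlockEntry)();

ARM* rcpu; s32 rcycles; u32 rcpsr;  // stand-ins for RCPU/RCycles/RCPSR

void ARM_Ret_Model()
{
    rcpu->Cycles = rcycles;         // str RCycles, [RCPU, ARM_Cycles_offset]
    rcpu->CPSR = rcpsr;             // str RCPSR, [RCPU, ARM_CPSR_offset]
}

void ARM_Dispatch_Model(ARM* cpu, JitBlockEntry block)
{
    rcpu = cpu;                     // mov RCPU, x0
    rcycles = cpu->Cycles;          // ldr RCycles, [RCPU, ...]
    rcpsr = cpu->CPSR;              // ldr RCPSR, [RCPU, ...]
    block();                        // br x1
}

int main()
{
    ARM cpu = {64, 0x10};
    ARM_Dispatch_Model(&cpu, +[]{ rcycles -= 12; ARM_Ret_Model(); });
    return cpu.Cycles == 52 ? 0 : 1;
}
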
QuickTailCall(X4, NDS::ARM9Write32); break; - case 16: QuickTailCall(X4, NDS::ARM9Write16); break; - case 8: QuickTailCall(X4, NDS::ARM9Write8); break; - } - } - else - { - if (size == 32) - ABI_PushRegisters({0, 30}); - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; - case 16: QuickTailCall (X4, NDS::ARM9Read16); break; - case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; - } - if (size == 32) - { - ABI_PopRegisters({1, 30}); - ANDI2R(W1, W1, 3); - LSL(W1, W1, 3); - RORV(W0, W0, W1); - RET(); - } - } - - SetJumpTarget(insideDTCM); - ANDI2R(W3, W3, 0x3FFF & addressMask); - ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - - RET(); - - SetJumpTarget(insideITCM); - ANDI2R(W3, W0, 0x7FFF & addressMask); - if (store) - { - ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4); - LSR(W5, W0, 9); - MOVP2R(X4, CodeRanges); - ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W4); - ABI_PushRegisters({1, 3, 30}); - QuickCallFunction(X4, InvalidateByAddr); - ABI_PopRegisters({1, 3, 30}); - SetJumpTarget(null); - } - ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - RET(); - - return res; + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); } -/* - W0 - base address - X1 - stack space - W2 - values count -*/ -void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +s64 Compiler::RewriteMemAccess(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W4, W0, W4); - CMP(W4, W5); - FixupBranch insideDTCM = B(CC_LO); + auto it = LoadStorePatches.find(pcOffset); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W4); - FixupBranch insideITCM = B(CC_LO); - - ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once - if (store) + if (it != LoadStorePatches.end()) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM9Write32); + LoadStorePatch patch = it->second; - ABI_PopRegisters({0, 1, 2, 30}); - } - else - { - QuickCallFunction(X4, NDS::ARM9Read32); - MOV(W4, W0); + ptrdiff_t curCodeOffset = GetCodeOffset(); - ABI_PopRegisters({0, 1, 2, 30}); + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); - STR(X4, X1, ArithOption(X2, true)); - } + BL(patch.PatchFunc); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); - SetJumpTarget(insideDTCM); + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); - ANDI2R(W4, W4, ~3 & 0x3FFF); - ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } + 
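
// Sketch of the fault-repair flow around IsJITFault/RewriteMemAccess above
// (my summary; the host signal-handler glue is outside this hunk): a fault
// inside the JIT region means an inlined fastmem access hit unmapped guest
// memory, so the LoadStorePatch recorded at compile time is used to replace
// the access with a BL to the slow-path thunk, NOP-pad the patch window,
// flush the icache and resume at pc + PatchOffset.
#include <cstdint>
#include <map>

typedef uint64_t u64;
typedef uint32_t u32;
typedef int32_t s32;

struct LoadStorePatch { void* PatchFunc; s32 PatchOffset; u32 PatchSize; };

std::map<u64, LoadStorePatch> loadStorePatches;  // keyed by code offset

bool HandleJitFault(u64 faultPc, u64 rxBase, u64 rxSize, s32* resumeDelta)
{
    if (faultPc < rxBase || faultPc - rxBase >= rxSize)
        return false;                    // IsJITFault(): not our code

    auto it = loadStorePatches.find(faultPc - rxBase);
    if (it == loadStorePatches.end())
        return false;                    // "this is a JIT bug!"

    // RewriteMemAccess(): BL it->second.PatchFunc at pc + PatchOffset,
    // then PatchSize/4 - 1 NOPs, then FlushIcacheSection(...)
    *resumeDelta = it->second.PatchOffset;
    loadStorePatches.erase(it);          // the site is a slow-path call now
    return true;
}

int main()
{
    loadStorePatches[0x40] = { nullptr, -4, 12 };
    s32 delta = 0;
    bool ok = HandleJitFault(0x10040, 0x10000, 0x1000, &delta);
    return (ok && delta == -4) ? 0 : 1;
}
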
SetCodePtrUnsafe(curCodeOffset); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - - SetJumpTarget(insideITCM); - - ANDI2R(W4, W0, ~3 & 0x7FFF); - - ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X6); - } - else - { - LDR(W5, RCPU, X6); - STR(X5, X1, ArithOption(X2, true)); - } + LoadStorePatches.erase(it); - if (store) - { - ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); - LSR(W6, W4, 9); - MOVP2R(X5, CodeRanges); - ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W5); - ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W4); - QuickCallFunction(X5, InvalidateByAddr); - ABI_PopRegisters({0, 1, 2, 4, 30}); - SetJumpTarget(null); + return patch.PatchOffset; } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; + printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); } -void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - AlignCode16(); - void* res = GetRXPtr(); + u32 localAddr = LocaliseCodeAddress(Num, addr); - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); - - ABI_PushRegisters({0, 1, 2, 30}); - if (store) + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM7Write32); - ABI_PopRegisters({0, 1, 2, 30}); + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - else - { - QuickCallFunction(X4, NDS::ARM7Read32); - MOV(W4, W0); - ABI_PopRegisters({0, 1, 2, 30}); - STR(X4, X1, ArithOption(X2, true)); - } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; -} + Comp_AddCycles_CDI(); -void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -309,6 +85,8 @@ void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); + + return true; } void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) @@ -318,163 +96,209 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) addressMask = ~3; if (size == 16) addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } if (flags & memop_Store) Comp_AddCycles_CD(); else Comp_AddCycles_CDI(); - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) - { - u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? 
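
// What the literal fold above does, as a reference model (a hypothetical
// two-word pool; the encoding is a plain ARM "LDR r0, [PC, #4]"): the word
// is read out of code memory while compiling and the load becomes a MOVI2R,
// unless the address shows up in InvalidLiterals, in which case the fold is
// skipped once and the load stays a real memory access.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

int main()
{
    u32 code[4] = { 0xE59F0004, 0, 0, 0xDEADBEEF };  // LDR r0, [PC, #4]; pool
    u32 instrAddr = 0x02000000;

    u32 r15 = instrAddr + 8;            // ARM reads PC two instructions ahead
    u32 literalAddr = r15 + 4;          // base + immediate offset
    u32 val = code[(literalAddr - instrAddr) / 4];

    assert(literalAddr == 0x0200000C);
    assert(val == 0xDEADBEEF);          // emitted as MOVI2R(rd, val)
}
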
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); - return; - } + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; } + ARM64Reg finalAddr = W0; + if (flags & memop_Post) { - ARM64Reg rdMapped = MapReg(rd); - ARM64Reg rnMapped = MapReg(rn); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; + finalAddr = rnMapped; + MOV(W0, rnMapped); + } - void* memFunc = Num == 0 - ? MemFunc9[size >> 4][!!(flags & memop_Store)] - : MemFunc7[size >> 4][!!((flags & memop_Store))]; + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might has become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) { - u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) - { - ARMv5* cpu5 = (ARMv5*)CurCPU; + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } - } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); - if (region.Mem != NULL) - { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - MOVP2R(X0, ptr); - if (flags & memop_Store) - STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); - else - { - LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); - if (size == 32 && addr & ~0x3) - ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); - } - return; - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } - } + patch.PatchFunc = flags & memop_Store + ? 
PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); - ARM64Reg finalAddr = W0; - if (flags & memop_Post) - { - finalAddr = rnMapped; - MOV(W0, rnMapped); - } + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); if (flags & memop_Store) - MOV(W1, rdMapped); - - if (!offset.IsImm) - Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); - // offset might become an immediate - if (offset.IsImm) { - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Imm); - else - ADD(finalAddr, rnMapped, offset.Imm); + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); } else { - if (offset.Reg.ShiftType == ST_ROR) + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, X7); + if (size == 32) { - ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); - offset = Op2(W0); + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); } - - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); - else - ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); } - if (!(flags & memop_Post) && (flags & memop_Writeback)) - MOV(rnMapped, W0); + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); - if (inlinePreparation) + if (func) { - if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) - ANDI2R(rdMapped, W0, 3); - if (size > 8) - ANDI2R(W0, W0, addressMask); + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } } - QuickCallFunction(X2, memFunc); - if (!(flags & memop_Store)) + else { - if (inlinePreparation && !(flags & memop_Store) && size == 32) + if (Num == 0) { - if (constLocalROR32 == 4) + MOV(X1, RCPU); + if (flags & memop_Store) { - LSL(rdMapped, rdMapped, 3); - RORV(rdMapped, W0, rdMapped); + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + } } - else if (constLocalROR32 > 0) - ROR_(rdMapped, W0, constLocalROR32 << 3); else - MOV(rdMapped, W0); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 16: QuickCallFunction(X3, SlowRead9); break; + case 8: QuickCallFunction(X3, SlowRead9); break; + } + } } - else if (flags & memop_SignExtend) + else { - if (size == 16) - SXTH(rdMapped, W0); - else if (size == 8) - SXTB(rdMapped, W0); + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite7); break; + } + } else - assert("What's wrong with you?"); + { + switch (size) + { + case 32: 
QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead7); break; + } + } } - else - MOV(rdMapped, W0); - - if (CurInstr.Info.Branches()) + + if (!(flags & memop_Store)) { - if (size < 32) - printf("LDR size < 32 branching?\n"); - Comp_JumpTo(rdMapped, Num == 0, false); + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); } } } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } } void Compiler::A_Comp_MemWB() @@ -589,19 +413,11 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - { - Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); - Comp_AddCycles_CDI(); - } - else - { - bool negative = addr < R15; - u32 abs = negative ? R15 - addr : addr - R15; - Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); - } + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -621,15 +437,138 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (regsCount == 0) return 0; // actually not the right behaviour TODO: fix me - SUB(SP, SP, ((regsCount + 1) & ~1) * 8); - if (store) + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 
4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; - int i = regsCount - 1; + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); BitSet16::Iterator it = regs.begin(); while (it != regs.end()) @@ -641,7 +580,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (usermode && reg >= 8 && reg < 15) { - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) MOV(W3, MapReg(reg)); else LoadReg(reg, W3); @@ -651,55 +590,67 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else if (!usermode && nextReg != regs.end()) { - ARM64Reg first = W3; - ARM64Reg second = W4; + ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); else LoadReg(reg, W3); - if (RegCache.Mapping[*nextReg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); - i--; + i++; it++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) + { STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } else { LoadReg(reg, W3); STR(INDEX_UNSIGNED, W3, SP, i * 8); } - i--; + i++; it++; } } - if (decrement) - { - SUB(W0, MapReg(rn), regsCount * 4); - preinc ^= true; - } - else - MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); MOVI2R(W2, regsCount); - BL(Num ? 
MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break; + } + } if (!store) { - Comp_AddCycles_CDI(); - if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + UBFX(W5, RCPSR, 0, 5); - int i = regsCount - 1; BitSet16::Iterator it = regs.begin(); while (it != regs.end()) { @@ -714,11 +665,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc MOVI2R(W1, reg - 8); BL(WriteBanked); FixupBranch alreadyWritten = CBNZ(W4); - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) MOV(MapReg(reg), W3); - RegCache.DirtyRegs |= 1 << reg; - } else SaveReg(reg, W3); SetJumpTarget(alreadyWritten); @@ -727,20 +675,12 @@ { ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; - } - if (RegCache.Mapping[*nextReg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); - if (*nextReg != 15) - RegCache.DirtyRegs |= 1 << *nextReg; - } - LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); if (first == W3) SaveReg(reg, W3); @@ -748,15 +688,12 @@ SaveReg(*nextReg, W4); it++; - i--; + i++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) { ARM64Reg mapped = MapReg(reg); LDR(INDEX_UNSIGNED, mapped, SP, i * 8); - - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; } else { @@ -765,11 +702,20 @@ it++; - i--; + i++; } } ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + if (!store && regs[15]) { ARM64Reg mapped = MapReg(15); diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +} \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4e45760..19684c4 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -3,8 +3,11 @@ #include "types.h" #include +#include +#include #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // here lands everything which doesn't fit into ARMJIT.h // where it would be included by pretty much
everything @@ -160,8 +163,8 @@ public: Data.SetLength(numAddresses * 2 + numLiterals); } - u32 PseudoPhysicalAddr; - + u32 StartAddr; + u32 StartAddrLocal; u32 InstrHash, LiteralHash; u8 Num; u16 NumAddresses; @@ -175,28 +178,8 @@ public: { return &Data[NumAddresses]; } u32* Literals() { return &Data[NumAddresses * 2]; } - u32* Links() - { return &Data[NumAddresses * 2 + NumLiterals]; } - - u32 NumLinks() - { return Data.Length - NumAddresses * 2 - NumLiterals; } - - void AddLink(u32 link) - { - Data.Add(link); - } - - void ResetLinks() - { - Data.SetLength(NumAddresses * 2 + NumLiterals); - } private: - /* - 0.. Data; }; @@ -207,45 +190,32 @@ struct __attribute__((packed)) AddressRange u32 Code; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemoryStatus9[0x800000]; -extern u8 MemoryStatus7[0x800000]; - extern TinyVector<u32> InvalidLiterals; -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); - -template <u32 Num> -void LinkBlock(ARM* cpu, u32 codeOffset); +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; -enum +inline bool PageContainsCode(AddressRange* range) { - memregion_Other = 0, - memregion_ITCM, - memregion_DTCM, - memregion_BIOS9, - memregion_MainRAM, - memregion_SWRAM9, - memregion_SWRAM7, - memregion_IO9, - memregion_VRAM, - memregion_BIOS7, - memregion_WRAM7, - memregion_IO7, - memregion_Wifi, - memregion_VWRAM, -}; + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); -int ClassifyAddress9(u32 addr); -int ClassifyAddress7(u32 addr); +template <u32 Num> +void LinkBlock(ARM* cpu, u32 codeOffset); -template <typename T> T SlowRead9(ARMv5* cpu, u32 addr); -template <typename T> void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template <typename T> T SlowRead9(u32 addr, ARMv5* cpu); +template <typename T> void SlowWrite9(u32 addr, ARMv5* cpu, T val); template <typename T> T SlowRead7(u32 addr); template <typename T> void SlowWrite7(u32 addr, T val); diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..162827d --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,822 @@ +#ifdef __SWITCH__ +#include "switch/compat_switch.h" +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "GPU.h" +#include "GPU3D.h" +#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and mapping the memory regions into it as they're structured + on the DS. + + On most systems you have a single piece of main ram, + maybe some video ram and faster cache RAM and that's about it. + Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is the 16 KB SWRAM block, which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform specific code.
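+
+  The rough flow (a simplified sketch of FaultHandler/MapAtAddress
+  below, not the literal code): both fastmem address spaces start out
+  fully unmapped, so the first JIT access to a page faults. The fault
+  handler then either maps in the whole mirror containing the emulated
+  address, or, if the region can't be mapped, rewrites the faulting
+  access into a call to the slow path, and finally restores the
+  register state and retries.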
+ +*/ + +namespace ARMJIT_Memory +{ +#ifdef __aarch64__ +struct FaultDescription +{ + u64 IntegerRegisters[33]; + u64 FaultAddr; + + u32 GetEmulatedAddr() + { + // now this is podracing + return (u32)IntegerRegisters[0]; + } + u64 RealAddr() + { + return FaultAddr; + } + + u64 GetPC() + { + return IntegerRegisters[32]; + } + + void RestoreAndRepeat(s64 offset); +}; +#else +struct FaultDescription +{ + u64 GetPC() + { + return 0; + } + + u32 GetEmulatedAddr() + { + return 0; + } + u64 RealAddr() + { + return 0; + } + + void RestoreAndRepeat(s64 offset); +}; +#endif + +void FaultHandler(FaultDescription* faultDesc); +} + + +#ifdef __aarch64__ + +extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +#endif + +#ifdef __SWITCH__ +// with LTO the symbols seem to be not properly overriden +// if they're somewhere else + +extern "C" +{ +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); + desc.IntegerRegisters[29] = ctx->fp.x; + desc.IntegerRegisters[30] = ctx->lr.x; + desc.IntegerRegisters[31] = ctx->sp.x; + desc.IntegerRegisters[32] = ctx->pc.x; + + ARMJIT_Memory::FaultHandler(&desc); + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} +#endif + +namespace ARMJIT_Memory +{ + +#ifdef __aarch64__ +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + IntegerRegisters[32] += offset; + + ARM_RestoreContext(IntegerRegisters); +} +#else +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + +} +#endif + +void* FastMem9Start, *FastMem7Start; + +const u32 MemoryTotalSize = + NDS::MainRAMSize + + NDS::SharedWRAMSize + + NDS::ARM7WRAMSize + + DTCMPhysicalSize; + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; +const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; +const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#ifdef __SWITCH__ +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#else +u8* MemoryBase; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + printf("%x\n", r); + return R_SUCCEEDED(r); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + printf("%x skip\n", NDS::ARM9->DTCMSize); + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } + } + } + } +}; +ARMJIT::TinyVector Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? 
memstate_MappedProtected : memstate_MappedRW; + + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then mapping the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCMEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCMEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCMEnd >= start && oldDTCMEnd < end)); + bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + { + Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + } + Mappings[memregion_SWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsMappable(region)) + return false; + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + + if (!isMapped) + return false; + + // this calculation even works with DTCM + // which doesn't have to be aligned to its own size + u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; + + u8* states = num == 0 ?
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < memorySize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < memorySize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeeded); + } + } + } + + Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + + return true; +} + +void FaultHandler(FaultDescription* faultDesc) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->GetEmulatedAddr(); + + if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + + s64 offset = 0; + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + } + faultDesc->RestoreAndRepeat(offset); + } +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeeded); + succeeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeeded); + + // 8 GB of address space, just don't ask...
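+ // each address space gets a full 4 GB reservation, so any 32 bit emulated
+ // address can be added to the base pointer as-is and the fast path needs
+ // neither masking nor a bounds check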
+ FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; +#else + MemoryBase = new u8[MemoryTotalSize]; + + NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; +#endif +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#else + delete[] MemoryBase; +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsMappable(int region) +{ + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mappingStart = 0; + mappingSize = NDS::ARM9->ITCMSize; + memorySize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_DTCM: + if (num == 0) + { + mappingStart = NDS::ARM9->DTCMBase; + mappingSize = NDS::ARM9->DTCMSize; + memorySize = DTCMPhysicalSize; + return true; + } + return false; + case memregion_BIOS9: + if (num == 0) + { + mappingStart = 0xFFFF0000; + mappingSize = 0x10000; + memorySize = 0x1000; + return true; + } + return false; + case memregion_MainRAM: + mappingStart = 0x2000000; + mappingSize = 0x1000000; + memorySize = NDS::MainRAMSize; + return true; + case memregion_SWRAM: + mappingStart = 0x3000000; + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mappingSize = 0x1000000; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM9.Mask + 1; + return true; + } + else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mappingSize = 0x800000; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + // this is a gross simplification + // mostly to make code on vram working + // it doesn't take any of the actual VRAM mappings into account + mappingStart = 0x6000000; + mappingSize = 0x1000000; + memorySize = 0x100000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mappingStart = 0; + mappingSize = 0x4000; + memorySize = 0x4000; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + if (NDS::SWRAM_ARM7.Mem) + { + mappingStart = 0x3800000; + mappingSize = 0x800000; + } + else + { + mappingStart = 0x3000000; + mappingSize = 0x1000000; + } + memorySize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mappingStart = 0x6000000; 
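+ // like the VRAM case on the ARM9 side this is a simplification: the ARM7
+ // allocated VRAM is treated as a single 128 KB block assumed to repeat
+ // across the whole 16 MB region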
+ mappingSize = 0x1000000; + memorySize = 0x20000; + return true; + } + return false; + default: + // for the JIT we don't care about the rest + return false; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM9.Mem) + return memregion_SWRAM; + else + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7.Mem) + return memregion_SWRAM; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensively + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; + } + } + else + { + switch
(addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } + } + } + return NULL; +} + +} \ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..1a59d98 --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,53 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); + +bool IsMappable(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd3fb70..34c1c91 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -301,24 +301,6 @@ Compiler::Compiler() RET(); } - { - CPSRDirty = true; - BranchStub[0] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<0>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - - CPSRDirty = true; - BranchStub[1] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<1>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - } - // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -520,6 +502,11 @@ void Compiler::Reset() FarCode = FarStart; } +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + void Compiler::Comp_SpecialBranchBehaviour(bool taken) { if (taken && CurInstr.BranchFlags & branch_IdleBranch) @@ -531,32 +518,11 
@@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) RegCache.PrepareExit(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) - && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)&ARM_Ret, true); - } + JMP((u8*)&ARM_Ret, true); } } -JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... { @@ -575,7 +541,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; // CPSR might have been modified in a previous block - CPSRDirty = Config::JIT_BrancheOptimisations == 2; + CPSRDirty = false; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); @@ -685,31 +651,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F RegCache.Flush(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 - && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) - && (!instrs[instrsCount - 1].Info.Branches() - || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken - || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)ARM_Ret, true); - } + JMP((u8*)ARM_Ret, true); /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -720,22 +662,6 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F return res; } -void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - JMP((u8*)entry, true); - SetCodePtr(curPtr); -} - -void Compiler::UnlinkBlock(u32 offset) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - NOP(5); - SetCodePtr(curPtr); -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f2fc301..09ac257 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -52,10 +52,7 @@ public: void Reset(); - void LinkBlock(u32 offset, JitBlockEntry entry); - void UnlinkBlock(u32 offset); - - JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -202,6 +199,10 @@ public: SetCodePtr(FarCode); } + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + u8* FarCode; u8* NearCode; u32 FarSize; @@ -216,8 +217,6 @@ public: bool Exit; bool IrregularCycles; - void* BranchStub[2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index cf0bd23..0bf2f83 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -15,6 +15,11 @@ int squeezePointer(T* ptr) return truncated; } +s32 Compiler::RewriteMemAccess(u64 pc) +{ + return 0; +} + /* According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) of all memory load and store instructions always access addresses in the same region as @@ -27,14 +32,15 @@ int squeezePointer(T* ptr) bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); + return false; + //u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - } + }*/ u32 val; // make sure arm7 bios is accessible @@ -95,7 +101,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - if (!addrIsStatic) + if (true) { OpArg rnMapped = MapReg(rn); if (Thumb && rn == 15) @@ -145,7 +151,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, rnMapped, R(finalAddr)); } - int expectedTarget = Num == 0 + /*int expectedTarget = Num == 0 ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); if (CurInstr.Cond() < 0xE) @@ -184,8 +190,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (addrIsStatic && compileSlowPath) MOV(32, R(RSCRATCH3), Imm32(staticAddress)); - - if (compileFastPath) +*/ + /*if (compileFastPath) { FixupBranch slowPath; if (compileSlowPath) @@ -357,15 +363,16 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz SetJumpTarget(slowPath); } } - - if (compileSlowPath) +*/ + if (true) { PushRegs(false); if (Num == 0) { - MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); if (flags & memop_Store) { MOV(32, R(ABI_PARAM3), rdMapped); @@ -423,13 +430,13 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); } } - +/* if (compileFastPath && compileSlowPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!(flags & memop_Store) && rd == 15) { @@ -458,7 +465,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 stackAlloc = ((regsCount + 1) & ~1) * 8; #endif u32 allocOffset = stackAlloc - regsCount * 8; - +/* int expectedTarget = Num == 0 ? ClassifyAddress9(CurInstr.DataRegion) : ClassifyAddress7(CurInstr.DataRegion); @@ -479,7 +486,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc default: break; } - +*/ if (!store) Comp_AddCycles_CDI(); else @@ -492,7 +499,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else MOV(32, R(RSCRATCH4), MapReg(rn)); - +/* if (compileFastPath) { assert(!usermode); @@ -570,7 +577,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SwitchToFarCode(); SetJumpTarget(slowPath); - } + }*/ if (!store) { @@ -696,13 +703,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PopRegs(false); } - +/* if (compileFastPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!store && regs[15]) { diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b50e821..ccec951 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -206,15 +206,14 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15, - - T_SetNZ = 1 << 16, - T_SetCV = 1 << 17, - T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19, - T_SetC = 1 << 20, + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, - T_WriteMem = 1 << 21, + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -256,31 +255,31 @@ const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); -const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); -const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); -const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 
T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); -const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); -const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); -const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); -const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); -const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); -const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); -const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); -const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); -const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); @@ -347,7 +346,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_BranchAlways) res.DstRegs |= (1 << 15); - if (data & T_PopPC && instr & (1 << 8)) + if (res.Kind == tk_POP && instr & (1 << 8)) res.DstRegs |= 1 << 15; if (data & T_SetNZ) @@ -364,11 +363,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; - if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + if (data & T_LoadMem) { - if (!Config::JIT_LiteralOptimisations) - res.SrcRegs |= 1 << 15; - res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) @@ -401,11 +407,17 @@ Info Decode(bool thumb, u32 num, u32 instr) else if ((instr >> 28) == 0xF) data = ak(ak_Nop); - if (data & A_UnkOnARM7 && num != 0) + if (data & A_UnkOnARM7 && num == 1) data = A_UNK; res.Kind = (data >> 22) & 0x1FF; + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + if (res.Kind == ak_MCR) { u32 cn = (instr >> 16) & 0xF; @@ -490,8 +502,13 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_WriteMem) res.SpecialKind = special_WriteMem; - if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) - res.SpecialKind = special_LoadLiteral; + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } if (res.Kind == ak_LDM) { diff --git 
a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 6ab4929..a702435 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -232,6 +232,7 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, + special_LoadMem, special_WaitForInterrupt, special_LoadLiteral }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f35b3e9..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,9 +55,11 @@ if (ENABLE_JIT) enable_language(ASM) target_sources(core PRIVATE - ARMJIT.cpp ARM_InstrInfo.cpp + ARMJIT.cpp + ARMJIT_Memory.cpp + dolphin/CommonFuncs.cpp ) @@ -85,6 +87,8 @@ if (ENABLE_JIT) ARMJIT_A64/ARMJIT_ALU.cpp ARMJIT_A64/ARMJIT_LoadStore.cpp ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s ) endif() endif() diff --git a/src/CP15.cpp b/src/CP15.cpp index 225847e..3d64259 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -22,6 +22,7 @@ #include "DSi.h" #include "ARM.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -42,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -75,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -98,36 +99,30 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { -#ifdef JIT_ENABLED - u32 oldDTCMBase = DTCMBase; - u32 oldDTCMSize = DTCMSize; -#endif + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) { - ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); - ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; } -#endif } void ARMv5::UpdateITCMSetting() { -#ifdef JIT_ENABLED - u32 oldITCMSize = ITCMSize; -#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -138,10 +133,6 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldITCMSize != ITCMSize) - ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); -#endif } @@ -581,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. 
%08X\n", val); + //Halt(255); return; @@ -723,7 +717,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -750,13 +744,13 @@ void ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -773,13 +767,13 @@ void ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -796,13 +790,13 @@ void ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -817,13 +811,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -838,16 +832,16 @@ void ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -864,16 +858,16 @@ void ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -890,16 +884,16 @@ void ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -914,16 +908,16 @@ void 
ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 22e9c11..edf84f2 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -47,8 +47,9 @@ int JIT_LiteralOptimisations = true; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; +int JIT_BrancheOptimisations = true; int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; #endif ConfigEntry ConfigFile[] = @@ -72,8 +73,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 31fa67a..7b19a4b 100644 --- a/src/Config.h +++ b/src/Config.h @@ -63,6 +63,7 @@ extern int JIT_Enable; extern int JIT_MaxBlockSize; extern int JIT_BrancheOptimisations; extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 657241f..3d65482 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,7 @@ #include "AREngine.h" #include "Platform.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -94,17 +95,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -171,6 +172,10 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); +#else + MainRAM = new u8[MainRAMSize]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; #endif DMAs[0] = new DMA(0, 0); @@ -485,6 +490,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -510,7 +519,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -587,10 +596,6 @@ void Reset() } AREngine::Reset(); - -#ifdef JIT_ENABLED - ARMJIT::Reset(); -#endif } void Stop() @@ -705,7 +710,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -1128,43 +1133,40 @@ void MapSharedWRAM(u8 val) if (val == WRAMCnt) return; + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch (WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - 
SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } - -#ifdef JIT_ENABLED - ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); - ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); -#endif } @@ -1835,12 +1837,12 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & MainRAMMask]; + return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1900,12 +1902,12 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & MainRAMMask]; + return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1968,9 +1970,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -2026,7 +2028,7 @@ void ARM9Write8(u32 addr, u8 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2035,12 +2037,12 @@ void ARM9Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2085,7 +2087,7 @@ void ARM9Write16(u32 addr, u16 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2094,12 +2096,12 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2113,18 +2115,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2165,7 +2165,7 @@ void ARM9Write32(u32 addr, u32 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2174,12 +2174,12 @@ void ARM9Write32(u32 addr, u32 val) return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2193,18 +2193,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2250,10 +2248,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2292,17 +2290,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2352,17 +2350,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2419,17 +2417,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return 
*(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2474,7 +2472,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2483,28 +2481,28 @@ void ARM7Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2514,7 +2512,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2551,7 +2549,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2560,28 +2558,28 @@ void ARM7Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2599,7 +2597,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2638,7 +2636,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2647,28 +2645,28 @@ void ARM7Write32(u32 addr, u32 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef 
JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2687,7 +2685,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2736,17 +2734,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } diff --git a/src/NDS.h b/src/NDS.h index e9b56da..4b4f9a1 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,20 +162,20 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; -extern u8 SharedWRAM[0x8000]; -extern u8* SWRAM_ARM9; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM9Mask; -extern u32 SWRAM_ARM7Mask; - -extern u8 ARM7WRAM[0x10000]; +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); -- cgit v1.2.3 From c5381d2911d47fb1fcbd6ec27a83f5da3606c4bd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 30 Jun 2020 23:50:41 +0200 Subject: reconcile DSi and JIT, fastmem for x64 and Windows --- src/ARM.cpp | 23 +- src/ARM.h | 2 +- src/ARMJIT.cpp | 273 +-- src/ARMJIT.h | 2 + src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 4 +- src/ARMJIT_Internal.h | 12 +- src/ARMJIT_Memory.cpp | 636 ++++-- src/ARMJIT_Memory.h | 16 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 109 + src/ARMJIT_x64/ARMJIT_Compiler.h | 14 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 632 +++--- src/CP15.cpp | 21 + src/Config.cpp | 20 +- src/Config.h | 9 +- src/DSi.cpp | 167 +- src/DSi.h | 15 + src/DSi_I2C.cpp | 4 +- src/NDS.cpp | 41 +- src/NDS.h | 2 + src/frontend/qt_sdl/EmuSettingsDialog.cpp | 115 +- src/frontend/qt_sdl/EmuSettingsDialog.h | 5 +- src/frontend/qt_sdl/EmuSettingsDialog.ui | 598 +++--- src/frontend/qt_sdl/main.cpp | 9 +- src/frontend/qt_sdl/main.h | 1 + src/libui_sdl/DlgEmuSettings.cpp | 252 --- src/libui_sdl/libui/ui.h | 764 ------- src/libui_sdl/libui/unix/stddialogs.c | 126 -- src/libui_sdl/libui/windows/stddialogs.cpp | 180 -- 
src/libui_sdl/main.cpp | 3061 ---------------------------- 29 files changed, 1656 insertions(+), 5457 deletions(-) delete mode 100644 src/libui_sdl/DlgEmuSettings.cpp delete mode 100644 src/libui_sdl/libui/ui.h delete mode 100644 src/libui_sdl/libui/unix/stddialogs.c delete mode 100644 src/libui_sdl/libui/windows/stddialogs.cpp delete mode 100644 src/libui_sdl/main.cpp (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index e529be8..8530795 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,12 +21,15 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" -#include "ARMJIT.h" #include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif // instruction timing notes // @@ -109,6 +112,12 @@ void ARM::Reset() CodeMem.Mem = NULL; +#ifdef JIT_ENABLED + FastBlockLookup = NULL; + FastBlockLookupStart = 0; + FastBlockLookupSize = 0; +#endif + // zorp JumpTo(ExceptionBase); } @@ -752,6 +761,12 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #ifdef JIT_ENABLED @@ -820,6 +835,12 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #endif diff --git a/src/ARM.h b/src/ARM.h index b7f16d6..0248e26 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -147,7 +147,7 @@ public: NDS::MemRegion CodeMem; #ifdef JIT_ENABLED - u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; #endif diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 53b28c1..2a61c38 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -18,6 +18,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "SPU.h" @@ -38,25 +39,35 @@ namespace ARMJIT Compiler* JITCompiler; AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; -AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMMaxSize / 512]; AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; AddressRange CodeIndexVRAM[0x100000 / 512]; AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; +AddressRange CodeIndexBIOS9DSi[0x10000 / 512]; +AddressRange CodeIndexBIOS7DSi[0x10000 / 512]; +AddressRange CodeIndexNWRAM_A[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_B[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_C[DSi::NWRAMSize / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; -u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMMaxSize / 2]; u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; u64 FastBlockLookupVRAM[0x100000 / 2]; u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; +u64 FastBlockLookupBIOS9DSi[0x10000 / 2]; +u64 FastBlockLookupBIOS7DSi[0x10000 / 2]; +u64 FastBlockLookupNWRAM_A[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_B[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_C[DSi::NWRAMSize / 2]; const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { @@ -64,7 +75,7 @@ const u32 
CodeRegionSizes[ARMJIT_Memory::memregions_Count] = ITCMPhysicalSize, 0, sizeof(NDS::ARM9BIOS), - NDS::MainRAMSize, + NDS::MainRAMMaxSize, NDS::SharedWRAMSize, 0, 0x100000, @@ -73,6 +84,11 @@ const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = 0, 0, 0x40000, + 0x10000, + 0x10000, + sizeof(DSi::NWRAM_A), + sizeof(DSi::NWRAM_B), + sizeof(DSi::NWRAM_C), }; AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = @@ -90,6 +106,11 @@ AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = NULL, NULL, CodeIndexARM7WVRAM, + CodeIndexBIOS9DSi, + CodeIndexBIOS7DSi, + CodeIndexNWRAM_A, + CodeIndexNWRAM_B, + CodeIndexNWRAM_C }; u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = @@ -106,7 +127,12 @@ u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = FastBlockLookupARM7WRAM, NULL, NULL, - FastBlockLookupARM7WVRAM + FastBlockLookupARM7WVRAM, + FastBlockLookupBIOS9DSi, + FastBlockLookupBIOS7DSi, + FastBlockLookupNWRAM_A, + FastBlockLookupNWRAM_B, + FastBlockLookupNWRAM_C }; u32 LocaliseCodeAddress(u32 num, u32 addr) @@ -115,21 +141,14 @@ u32 LocaliseCodeAddress(u32 num, u32 addr) ? ARMJIT_Memory::ClassifyAddress9(addr) : ARMJIT_Memory::ClassifyAddress7(addr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize) - && CodeMemRegions[region]) - { - addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - addr |= (u32)region << 28; - return addr; - } + if (CodeMemRegions[region]) + return ARMJIT_Memory::LocaliseAddress(region, num, addr); return 0; } TinyVector InvalidLiterals; -template +template T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; @@ -141,11 +160,11 @@ T SlowRead9(u32 addr, ARMv5* cpu) else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; else if (std::is_same::value) - val = NDS::ARM9Read32(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read32 : DSi::ARM9Read32)(addr); else if (std::is_same::value) - val = NDS::ARM9Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read16 : DSi::ARM9Read16)(addr); else - val = NDS::ARM9Read8(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read8 : DSi::ARM9Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -153,7 +172,7 @@ T SlowRead9(u32 addr, ARMv5* cpu) return val; } -template +template void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); @@ -169,27 +188,19 @@ void SlowWrite9(u32 addr, ARMv5* cpu, T val) } else if (std::is_same::value) { - NDS::ARM9Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write32 : DSi::ARM9Write32)(addr, val); } else if (std::is_same::value) { - NDS::ARM9Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write16 : DSi::ARM9Write16)(addr, val); } else { - NDS::ARM9Write8(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write8 : DSi::ARM9Write8)(addr, val); } } -template void SlowWrite9(u32, ARMv5*, u32); -template void SlowWrite9(u32, ARMv5*, u16); -template void SlowWrite9(u32, ARMv5*, u8); - -template u32 SlowRead9(u32, ARMv5*); -template u16 SlowRead9(u32, ARMv5*); -template u8 SlowRead9(u32, ARMv5*); - -template +template T SlowRead7(u32 addr) { u32 offset = addr & 0x3; @@ -197,11 +208,11 @@ T SlowRead7(u32 addr) T val; if (std::is_same::value) - val = NDS::ARM7Read32(addr); + val = (ConsoleType == 0 ? 
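(The SlowRead/SlowWrite fallbacks being rewritten here are templated on ConsoleType so the NDS-versus-DSi bus choice is resolved at compile time; the INSTANTIATE_SLOWMEM macro further down stamps out both variants. A trimmed sketch of the same trick, with placeholder bus functions standing in for NDS::ARM7Read32 / DSi::ARM7Read32:)

    #include <cstdint>
    using u32 = std::uint32_t;

    u32 NDSRead32(u32 addr) { return addr; }   // placeholder bodies for the sketch
    u32 DSiRead32(u32 addr) { return ~addr; }

    // ConsoleType is a compile-time constant, so each instantiation collapses
    // to a direct call, and the JIT can take the address of whichever
    // specialisation it needs without any runtime branch.
    template <u32 ConsoleType>
    u32 SlowRead32(u32 addr)
    {
        return (ConsoleType == 0 ? NDSRead32 : DSiRead32)(addr);
    }

    // Mirrors the effect of INSTANTIATE_SLOWMEM(0) / INSTANTIATE_SLOWMEM(1).
    template u32 SlowRead32<0>(u32);
    template u32 SlowRead32<1>(u32);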
NDS::ARM7Read32 : DSi::ARM7Read32)(addr); else if (std::is_same::value) - val = NDS::ARM7Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read16 : DSi::ARM7Read16)(addr); else - val = NDS::ARM7Read8(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read8 : DSi::ARM7Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -209,67 +220,71 @@ T SlowRead7(u32 addr) return val; } -template +template void SlowWrite7(u32 addr, T val) { addr &= ~(sizeof(T) - 1); if (std::is_same::value) - NDS::ARM7Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write32 : DSi::ARM7Write32)(addr, val); else if (std::is_same::value) - NDS::ARM7Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write16 : DSi::ARM7Write16)(addr, val); else - NDS::ARM7Write8(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write8 : DSi::ARM7Write8)(addr, val); } -template +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite9(addr, cpu, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(addr, cpu); + data[i] = SlowRead9(addr, cpu); addr += 4; } } -template +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite7(addr, data[i]); + SlowWrite7(addr, data[i]); else - data[i] = SlowRead7(addr); + data[i] = SlowRead7(addr); addr += 4; } } -template void SlowWrite7(u32, u32); -template void SlowWrite7(u32, u16); -template void SlowWrite7(u32, u8); - -template u32 SlowRead7(u32); -template u16 SlowRead7(u32); -template u8 SlowRead7(u32); - -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +#define INSTANTIATE_SLOWMEM(consoleType) \ + template void SlowWrite9(u32, ARMv5*, u32); \ + template void SlowWrite9(u32, ARMv5*, u16); \ + template void SlowWrite9(u32, ARMv5*, u8); \ + \ + template u32 SlowRead9(u32, ARMv5*); \ + template u16 SlowRead9(u32, ARMv5*); \ + template u8 SlowRead9(u32, ARMv5*); \ + \ + template void SlowWrite7(u32, u32); \ + template void SlowWrite7(u32, u16); \ + template void SlowWrite7(u32, u8); \ + \ + template u32 SlowRead7(u32); \ + template u16 SlowRead7(u32); \ + template u8 SlowRead7(u32); \ + \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + +INSTANTIATE_SLOWMEM(0) +INSTANTIATE_SLOWMEM(1) template struct UnreliableHashTable @@ -616,6 +631,12 @@ void CompileBlock(ARM* cpu) u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + auto& map = cpu->Num == 0 ? 
JitBlocks9 : JitBlocks7; auto existingBlockIt = map.find(blockAddr); if (existingBlockIt != map.end()) @@ -623,18 +644,24 @@ void CompileBlock(ARM* cpu) // there's already a block, though it's not inside the fast map // could be that there are two blocks at the same physical addr // but different mirrors - u32 localAddr = existingBlockIt->second->StartAddrLocal; + u32 otherLocalAddr = existingBlockIt->second->StartAddrLocal; - u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; - *entry = ((u64)blockAddr | cpu->Num) << 32; - *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); - return; - } + if (localAddr == otherLocalAddr) + { + JIT_DEBUGPRINT("switching out block %x %x %x\n", localAddr, blockAddr, existingBlockIt->second->StartAddr); - u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); - if (!localAddr) - { - printf("trying to compile non executable code? %x\n", blockAddr); + u64* entry = &FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + // some memory has been remapped + JitBlock* prevBlock = RestoreCandidates.Insert(existingBlockIt->second->InstrHash, existingBlockIt->second); + if (prevBlock) + delete prevBlock; + + map.erase(existingBlockIt); } FetchedInstr instrs[Config::JIT_MaxBlockSize]; @@ -655,7 +682,7 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -678,7 +705,7 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); - assert(translatedAddr); + assert(translatedAddr >> 27); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -727,7 +754,10 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; - if (instrs[i].Info.DstRegs & (1 << 14)) + if (instrs[i].Info.DstRegs & (1 << 14) + || (!thumb + && (instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_IMM || instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_REG) + && instrs[i].Instr & (1 << 16))) hasLink = false; if (thumb) @@ -792,7 +822,7 @@ void CompileBlock(ARM* cpu) i--; } - if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + if (instrs[i].Info.Branches() && Config::JIT_BranchOptimisations) { bool hasBranched = cpu->R[15] != r15; @@ -830,8 +860,6 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); - if (link) { lr = linkAddr; @@ -927,6 +955,8 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + + JIT_DEBUGPRINT("block start %p\n", block->EntryPoint); } else { @@ -940,12 +970,12 @@ void CompileBlock(ARM* cpu) assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; - if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) - 
ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); - AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; range->Code |= addressMasks[j]; range->Blocks.Add(block); } @@ -955,7 +985,7 @@ void CompileBlock(ARM* cpu) else JitBlocks7[blockAddr] = block; - u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + u64* entry = &FastBlockLookupRegions[(localAddr >> 27)][(localAddr & 0x7FFFFFF) / 2]; *entry = ((u64)blockAddr | cpu->Num) << 32; *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } @@ -964,8 +994,8 @@ void InvalidateByAddr(u32 localAddr) { JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* region = CodeMemRegions[localAddr >> 28]; - AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + AddressRange* region = CodeMemRegions[localAddr >> 27]; + AddressRange* range = ®ion[(localAddr & 0x7FFFFFF) / 512]; u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; @@ -994,9 +1024,9 @@ void InvalidateByAddr(u32 localAddr) range->Blocks.Remove(i); if (range->Blocks.Length == 0 - && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) { - ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + ARMJIT_Memory::SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); } bool literalInvalidation = false; @@ -1019,8 +1049,8 @@ void InvalidateByAddr(u32 localAddr) u32 addr = block->AddressRanges()[j]; if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRegion = CodeMemRegions[addr >> 28]; - AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 27]; + AddressRange* otherRange = &otherRegion[(addr & 0x7FFFFFF) / 512]; assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); @@ -1028,15 +1058,15 @@ void InvalidateByAddr(u32 localAddr) if (otherRange->Blocks.Length == 0) { - if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) - ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); otherRange->Code = 0; } } } - FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; + FastBlockLookupRegions[block->StartAddrLocal >> 27][(block->StartAddrLocal & 0x7FFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) JitBlocks9.erase(block->StartAddr); else @@ -1055,19 +1085,25 @@ void InvalidateByAddr(u32 localAddr) } } -template -void CheckAndInvalidate(u32 addr) +void CheckAndInvalidateITCM() { - // let's hope this gets all properly inlined - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + for (u32 i = 0; i < ITCMPhysicalSize; i+=16) { - u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) - InvalidateByAddr(localAddr | (region << 28)); + if (CodeIndexITCM[i / 512].Code & (1 << ((i & 0x1FF) / 16))) + { + 
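(Worth spelling out, since this commit changes every shift and mask in sight: a "localised" code address now keeps the memory-region index in the top 5 bits and the offset into that region's backing memory in the low 27 bits, where it was previously a 4/28 split. Each AddressRange covers 512 bytes of guest memory, and its Code field is a 32-bit bitmap at 16-byte granularity. A small sketch of the arithmetic, assuming only that layout:)

    #include <cstdint>
    using u32 = std::uint32_t;

    // Localised address layout after this commit: region in bits 27..31,
    // offset within the region in bits 0..26 (hence the >> 27 / & 0x7FFFFFF
    // pairs replacing the old >> 28 / & 0xFFFFFFF).
    constexpr u32 LocaliseAddr(u32 region, u32 offset) { return (region << 27) | (offset & 0x7FFFFFF); }
    constexpr u32 RegionOf(u32 localAddr)  { return localAddr >> 27; }
    constexpr u32 OffsetOf(u32 localAddr)  { return localAddr & 0x7FFFFFF; }

    // One AddressRange per 512 bytes; one Code bit per 16-byte chunk, which
    // is what the (1 << ((addr & 0x1FF) / 16)) expressions index.
    constexpr u32 RangeIndex(u32 localAddr) { return OffsetOf(localAddr) / 512; }
    constexpr u32 ChunkBit(u32 localAddr)   { return 1u << ((localAddr & 0x1FF) / 16); }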
InvalidateByAddr(i | (ARMJIT_Memory::memregion_ITCM << 27)); + } } } +template +void CheckAndInvalidate(u32 addr) +{ + u32 localAddr = ARMJIT_Memory::LocaliseAddress(region, num, addr); + if (CodeMemRegions[region][(localAddr & 0x7FFFFFF) / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr); +} + JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) { u64* entry = &entries[offset / 2]; @@ -1076,35 +1112,44 @@ JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) return NULL; } +void blockSanityCheck(u32 num, u32 blockAddr, JitBlockEntry entry) +{ + u32 localAddr = LocaliseCodeAddress(num, blockAddr); + assert(JITCompiler->AddEntryOffset((u32)FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]) == entry); +} + bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { + // amazingly ignoring the DTCM is the proper behaviour for code fetches int region = num == 0 ? ARMJIT_Memory::ClassifyAddress9(blockAddr) : ARMJIT_Memory::ClassifyAddress7(blockAddr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (CodeMemRegions[region] - && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize)) + u32 memoryOffset; + if (FastBlockLookupRegions[region] + && ARMJIT_Memory::GetMirrorLocation(region, num, blockAddr, memoryOffset, start, size)) { + //printf("setup exec region %d %d %08x %08x %x %x\n", num, region, blockAddr, start, size, memoryOffset); entry = FastBlockLookupRegions[region] + memoryOffset / 2; - // evil, though it should work for everything except DTCM which is not relevant here - start = blockAddr & ~(memorySize - 1); - size = memorySize; return true; } - else - return false; + return false; } template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); -template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); -template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); void ResetBlockCache() { @@ -1133,7 +1178,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } @@ -1145,7 +1190,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - 
AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2320b7b..04add59 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -16,6 +16,8 @@ void DeInit(); void Reset(); +void CheckAndInvalidateITCM(); + void InvalidateByAddr(u32 pseudoPhysical); template diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index b307d0e..c1b23a7 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -168,7 +168,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) { ptrdiff_t memopStart = GetCodeOffset(); LoadStorePatch patch; @@ -461,7 +461,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); bool compileFastPath = Config::JIT_FastMemory - && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); if (decrement) { diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 19684c4..c87e1b3 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -214,13 +214,13 @@ u32 LocaliseCodeAddress(u32 num, u32 addr); template void LinkBlock(ARM* cpu, u32 codeOffset); -template T SlowRead9(u32 addr, ARMv5* cpu); -template void SlowWrite9(u32 addr, ARMv5* cpu, T val); -template T SlowRead7(u32 addr); -template void SlowWrite7(u32 addr, T val); +template T SlowRead9(u32 addr, ARMv5* cpu); +template void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); -template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); } diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp index 162827d..0276c65 100644 --- a/src/ARMJIT_Memory.cpp +++ b/src/ARMJIT_Memory.cpp @@ -1,5 +1,7 @@ -#ifdef __SWITCH__ +#if defined(__SWITCH__) #include "switch/compat_switch.h" +#elif defined(_WIN32) +#include #endif #include "ARMJIT_Memory.h" @@ -7,6 +9,7 @@ #include "ARMJIT_Internal.h" #include "ARMJIT_Compiler.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "Wifi.h" @@ -37,66 +40,24 @@ namespace ARMJIT_Memory { -#ifdef __aarch64__ -struct FaultDescription -{ - u64 IntegerRegisters[33]; - u64 FaultAddr; - - u32 GetEmulatedAddr() - { - // now this is podracing - return (u32)IntegerRegisters[0]; - } - u64 RealAddr() - { - return FaultAddr; - } - - u64 GetPC() - { - return IntegerRegisters[32]; - } - - void RestoreAndRepeat(s64 offset); -}; -#else struct FaultDescription { - u64 GetPC() - { - return 0; - } - - u32 GetEmulatedAddr() - { - return 0; - } - u64 RealAddr() - { - return 0; - } - - void RestoreAndRepeat(s64 
offset); + u32 EmulatedFaultAddr; + u64 FaultPC; }; -#endif -void FaultHandler(FaultDescription* faultDesc); +bool FaultHandler(FaultDescription* faultDesc, s32& offset); } - -#ifdef __aarch64__ - -extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); - -#endif - -#ifdef __SWITCH__ +#if defined(__SWITCH__) // with LTO the symbols seem to be not properly overriden // if they're somewhere else extern "C" { + +void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + extern char __start__; extern char __rodata_start; @@ -106,57 +67,85 @@ u64 __nx_exception_stack_size = 0x8000; void __libnx_exception_handler(ThreadExceptionDump* ctx) { ARMJIT_Memory::FaultDescription desc; - memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); - desc.IntegerRegisters[29] = ctx->fp.x; - desc.IntegerRegisters[30] = ctx->lr.x; - desc.IntegerRegisters[31] = ctx->sp.x; - desc.IntegerRegisters[32] = ctx->pc.x; + desc.EmulatedFaultAddr = ctx->cpu_gprs[0].w; + desc.FaultPC = ctx->pc.x; + + u64 integerRegisters[33]; + memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); + integerRegisters[29] = ctx->fp.x; + integerRegisters[30] = ctx->lr.x; + integerRegisters[31] = ctx->sp.x; + integerRegisters[32] = ctx->pc.x; + + s32 offset; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + integerRegisters[32] += offset; - ARMJIT_Memory::FaultHandler(&desc); + ARM_RestoreContext(integerRegisters); + } if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) { - printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); } else { - printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); } } } + +#elif defined(_WIN32) + +static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) +{ + if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) + return EXCEPTION_CONTINUE_SEARCH; + + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = exceptionInfo->ContextRecord->Rcx; + desc.FaultPC = exceptionInfo->ContextRecord->Rip; + + s32 offset = 0; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + exceptionInfo->ContextRecord->Rip += offset; + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + #endif namespace ARMJIT_Memory { -#ifdef __aarch64__ -void FaultDescription::RestoreAndRepeat(s64 offset) -{ - IntegerRegisters[32] += offset; +void* FastMem9Start, *FastMem7Start; - ARM_RestoreContext(IntegerRegisters); +#ifdef _WIN32 +inline u32 RoundUp(u32 size) +{ + return (size + 0xFFFF) & ~0xFFFF; } #else -void FaultDescription::RestoreAndRepeat(s64 offset) +inline u32 RoundUp(u32 size) { - + return size; } #endif -void* FastMem9Start, *FastMem7Start; - -const u32 MemoryTotalSize = - NDS::MainRAMSize - + NDS::SharedWRAMSize - + NDS::ARM7WRAMSize - + DTCMPhysicalSize; - const u32 MemBlockMainRAMOffset = 0; -const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; -const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; -const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; +const u32 MemBlockSWRAMOffset = RoundUp(NDS::MainRAMMaxSize); +const u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(NDS::SharedWRAMSize); +const u32 MemBlockDTCMOffset 
= MemBlockARM7WRAMOffset + RoundUp(NDS::ARM7WRAMSize); +const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(DSi::NWRAMSize); +const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(DSi::NWRAMSize); +const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(DSi::NWRAMSize); const u32 OffsetsPerRegion[memregions_Count] = { @@ -173,6 +162,11 @@ const u32 OffsetsPerRegion[memregions_Count] = UINT32_MAX, UINT32_MAX, UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockNWRAM_AOffset, + MemBlockNWRAM_BOffset, + MemBlockNWRAM_COffset }; enum @@ -186,11 +180,13 @@ enum u8 MappingStatus9[1 << (32-12)]; u8 MappingStatus7[1 << (32-12)]; -#ifdef __SWITCH__ +#if defined(__SWITCH__) u8* MemoryBase; u8* MemoryBaseCodeMem; -#else +#elif defined(_WIN32) u8* MemoryBase; +HANDLE MemoryFile; +LPVOID ExceptionHandlerHandle; #endif bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) @@ -200,6 +196,9 @@ bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size)); return R_SUCCEEDED(r); +#elif defined(_WIN32) + bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; + return r; #endif } @@ -209,8 +208,24 @@ bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) #ifdef __SWITCH__ Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size); - printf("%x\n", r); return R_SUCCEEDED(r); +#else + return UnmapViewOfFile(dst); +#endif +} + +void SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#if defined(_WIN32) + DWORD winProtection, oldProtection; + if (protection == 0) + winProtection = PAGE_NOACCESS; + else if (protection == 1) + winProtection = PAGE_READONLY; + else + winProtection = PAGE_READWRITE; + VirtualProtect(dst, size, winProtection, &oldProtection); #endif } @@ -230,7 +245,6 @@ struct Mapping if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) { offset += NDS::ARM9->DTCMSize; - printf("%x skip\n", NDS::ARM9->DTCMSize); } else { @@ -245,6 +259,7 @@ struct Mapping offset += 0x1000; } +#ifdef __SWITCH__ if (status == memstate_MappedRW) { u32 segmentSize = offset - segmentOffset; @@ -252,8 +267,12 @@ struct Mapping bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); assert(success); } +#endif } } +#if defined(_WIN32) + UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); +#endif } }; ARMJIT::TinyVector Mappings[memregions_Count]; @@ -268,6 +287,8 @@ void SetCodeProtection(int region, u32 offset, bool protect) Mapping& mapping = Mappings[region][i]; u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (offset < mapping.LocalOffset || offset >= mapping.LocalOffset + mapping.Size) + continue; if (mapping.Num == 0 && region != memregion_DTCM && effectiveAddr >= NDS::ARM9->DTCMBase @@ -276,16 +297,20 @@ void SetCodeProtection(int region, u32 offset, bool protect) u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); - printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); assert(states[effectiveAddr >> 12] == (protect ? 
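(The Windows path added here leans on a single pagefile-backed file mapping as the backing store for all emulated RAM: MapViewOfFileEx places extra views of the same section inside the reserved fastmem regions, which is what makes mirrors cheap, and VirtualProtect toggles page access for code protection. Note the RoundUp to 64 KB above — view offsets must be multiples of the Windows allocation granularity. A standalone sketch of the mirroring idea:)

    #ifdef _WIN32
    #include <windows.h>
    #include <cassert>

    int main()
    {
        const DWORD size = 0x10000; // one 64 KB granule
        HANDLE section = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, size, NULL);
        assert(section);

        // Two views of the same section: the same physical pages show up at
        // two different virtual addresses.
        char* viewA = (char*)MapViewOfFile(section, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, size);
        char* viewB = (char*)MapViewOfFile(section, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, size);
        assert(viewA && viewB);

        viewA[0] = 42;
        assert(viewB[0] == 42); // write through one view, read through the other

        UnmapViewOfFile(viewB);
        UnmapViewOfFile(viewA);
        CloseHandle(section);
        return 0;
    }
    #endif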
memstate_MappedRW : memstate_MappedProtected)); states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; +#if defined(__SWITCH__) bool success; if (protect) success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); else success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); assert(success); +#elif defined(_WIN32) + SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 1 : 2); +#endif } } @@ -314,8 +339,8 @@ void RemapDTCM(u32 newBase, u32 newSize) printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); - bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); - bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd < start); + bool newOverlap = newSize > 0 && !(newBase >= end || newEnd < start); if (mapping.Num == 0 && (oldOverlap || newOverlap)) { @@ -336,19 +361,50 @@ void RemapDTCM(u32 newBase, u32 newSize) Mappings[memregion_DTCM].Clear(); } +void RemapNWRAM(int num) +{ + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) + { + Mapping& mapping = Mappings[memregion_SharedWRAM][i]; + if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size + || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + { + mapping.Unmap(memregion_SharedWRAM); + Mappings[memregion_SharedWRAM].Remove(i); + } + else + { + i++; + } + } + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + num].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + num][i].Unmap(memregion_NewSharedWRAM_A + num); + } + Mappings[memregion_NewSharedWRAM_A + num].Clear(); +} + void RemapSWRAM() { printf("remapping SWRAM\n"); - for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length; i++) { - Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + Mappings[memregion_SharedWRAM][i].Unmap(memregion_SharedWRAM); } - Mappings[memregion_SWRAM].Clear(); + Mappings[memregion_SharedWRAM].Clear(); for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) { Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); } Mappings[memregion_WRAM7].Clear(); + for (int j = 0; j < 3; j++) + { + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + j].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + j][i].Unmap(memregion_NewSharedWRAM_A + j); + } + Mappings[memregion_NewSharedWRAM_A + j].Clear(); + } } bool MapAtAddress(u32 addr) @@ -359,33 +415,36 @@ bool MapAtAddress(u32 addr) ? ClassifyAddress9(addr) : ClassifyAddress7(addr); - if (!IsMappable(region)) + if (!IsFastmemCompatible(region)) return false; - u32 mappingStart, mappingSize, memoryOffset, memorySize; - bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + return false; + u32 mirrorStart, mirrorSize, memoryOffset; + bool isMapped = GetMirrorLocation(region, num, addr, memoryOffset, mirrorStart, mirrorSize); if (!isMapped) return false; - // this calculation even works with DTCM - // which doesn't have to be aligned to it's own size - u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; - u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; - printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + printf("trying to create mapping %x, %x %d %d\n", mirrorStart, mirrorSize, region, num); bool isExecutable = ARMJIT::CodeMemRegions[region]; - ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; +#if defined(_WIN32) + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); +#endif + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; // this overcomplicated piece of code basically just finds whole pieces of code memory // which can be mapped u32 offset = 0; bool skipDTCM = num == 0 && region != memregion_DTCM; - while (offset < memorySize) + while (offset < mirrorSize) { if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) { + SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); offset += NDS::ARM9->DTCMSize; } else @@ -393,7 +452,7 @@ bool MapAtAddress(u32 addr) u32 sectionOffset = offset; bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) - && offset < memorySize + && offset < mirrorSize && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) { assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); @@ -403,41 +462,49 @@ bool MapAtAddress(u32 addr) u32 sectionSize = offset - sectionOffset; +#if defined(__SWITCH__) if (!hasCode) { printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); assert(succeded); } +#elif defined(_WIN32) + if (hasCode) + { + SetCodeProtectionRange(mirrorStart + offset, sectionSize, num, 1); + } +#endif } } - Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + assert(num == 0 || num == 1); + Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; Mappings[region].Add(mapping); - printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); return true; } -void FaultHandler(FaultDescription* faultDesc) +bool FaultHandler(FaultDescription* faultDesc, s32& offset) { - if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) { bool rewriteToSlowPath = true; - u32 addr = faultDesc->GetEmulatedAddr(); + u32 addr = faultDesc->EmulatedFaultAddr; if ((NDS::CurCPU == 0 ? 
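(The protocol between the generated fast path and the host fault handler, restated as a sketch — IsJITFault, TryMapAtAddress and RewriteMemAccess are stand-ins for the real hooks: a fault inside JIT code is either satisfied by materialising the missing mapping and retrying the access, or the faulting PC is advanced by an offset that lands in a patched-in slow-path call.)

    #include <cstdint>
    using u32 = std::uint32_t; using u64 = std::uint64_t; using s32 = std::int32_t;

    bool IsJITFault(u64 pc);        // stand-ins for the real JIT hooks
    bool TryMapAtAddress(u32 addr);
    s32  RewriteMemAccess(u64 pc);

    struct Fault { u32 emulatedAddr; u64 pc; };

    bool HandleFault(Fault& f, s32& pcAdjust)
    {
        if (!IsJITFault(f.pc))
            return false;                  // a genuine crash, let it propagate
        if (TryMapAtAddress(f.emulatedAddr))
        {
            pcAdjust = 0;                  // mapping exists now, just retry
            return true;
        }
        pcAdjust = RewriteMemAccess(f.pc); // patch to the slow path, skip ahead
        return true;
    }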
MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped)
-            rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr());
+            rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr);
 
-        s64 offset = 0;
         if (rewriteToSlowPath)
         {
-            offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC());
+            offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC);
         }
-        faultDesc->RestoreAndRepeat(offset);
+        return true;
     }
+    return false;
 }
 
 void Init()
 {
@@ -459,18 +526,34 @@ void Init()
     FastMem7Start = virtmemReserve(0x100000000);
     assert(FastMem7Start);
 
-    NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset;
-    NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset;
-    NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset;
-    NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset;
-#else
-    MemoryBase = new u8[MemoryTotalSize];
+    u8* basePtr = MemoryBaseCodeMem;
+#elif defined(_WIN32)
+    ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler);
+
+    MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL);
 
-    NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset;
-    NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset;
-    NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset;
-    NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset;
+    MemoryBase = (u8*)VirtualAlloc(NULL, MemoryTotalSize, MEM_RESERVE, PAGE_READWRITE);
+
+    FastMem9Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE);
+    FastMem7Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE);
+
+    // only free them after they have all been reserved
+    // so they can't overlap
+    VirtualFree(MemoryBase, 0, MEM_RELEASE);
+    VirtualFree(FastMem9Start, 0, MEM_RELEASE);
+    VirtualFree(FastMem7Start, 0, MEM_RELEASE);
+
+    MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase);
+
+    u8* basePtr = MemoryBase;
 #endif
+    NDS::MainRAM = basePtr + MemBlockMainRAMOffset;
+    NDS::SharedWRAM = basePtr + MemBlockSWRAMOffset;
+    NDS::ARM7WRAM = basePtr + MemBlockARM7WRAMOffset;
+    NDS::ARM9->DTCM = basePtr + MemBlockDTCMOffset;
+    DSi::NWRAM_A = basePtr + MemBlockNWRAM_AOffset;
+    DSi::NWRAM_B = basePtr + MemBlockNWRAM_BOffset;
+    DSi::NWRAM_C = basePtr + MemBlockNWRAM_COffset;
 }
 
 void DeInit()
 {
@@ -482,8 +565,11 @@ void DeInit()
     svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize);
     virtmemFree(MemoryBaseCodeMem, MemoryTotalSize);
     free(MemoryBase);
-#else
-    delete[] MemoryBase;
+#elif defined(_WIN32)
+    assert(UnmapViewOfFile(MemoryBase));
+    CloseHandle(MemoryFile);
+
+    RemoveVectoredExceptionHandler(ExceptionHandlerHandle);
 #endif
 }
 
@@ -505,12 +591,23 @@ void Reset()
     printf("done resetting jit mem\n");
 }
 
-bool IsMappable(int region)
+bool IsFastmemCompatible(int region)
 {
+#ifdef _WIN32
+    /*
+        TODO: with some hacks, the smaller shared WRAM regions
+        could be mapped on some occasions as well
+    */
+    if (region == memregion_DTCM
+        || region == memregion_SharedWRAM
+        || region == memregion_NewSharedWRAM_B
+        || region == memregion_NewSharedWRAM_C)
+        return false;
+#endif
     return OffsetsPerRegion[region] != UINT32_MAX;
 }
 
-bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize)
+bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize)
 {
     memoryOffset = 0;
     switch (region)
     {
@@ -518,137 +615,251 @@ bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize,
         case
memregion_ITCM:
            if (num == 0)
            {
-                mappingStart = 0;
-                mappingSize = NDS::ARM9->ITCMSize;
-                memorySize = ITCMPhysicalSize;
+                mirrorStart = addr & ~(ITCMPhysicalSize - 1);
+                mirrorSize = ITCMPhysicalSize;
                 return true;
             }
             return false;
-        case memregion_DTCM:
+        case memregion_MainRAM:
+            mirrorStart = addr & ~NDS::MainRAMMask;
+            mirrorSize = NDS::MainRAMMask + 1;
+            return true;
+        case memregion_BIOS9:
             if (num == 0)
             {
-                mappingStart = NDS::ARM9->DTCMBase;
-                mappingSize = NDS::ARM9->DTCMSize;
-                memorySize = DTCMPhysicalSize;
+                mirrorStart = addr & ~0xFFF;
+                mirrorSize = 0x1000;
                 return true;
             }
             return false;
-        case memregion_BIOS9:
-            if (num == 0)
+        case memregion_BIOS7:
+            if (num == 1)
             {
-                mappingStart = 0xFFFF0000;
-                mappingSize = 0x10000;
-                memorySize = 0x1000;
+                mirrorStart = 0;
+                mirrorSize = 0x4000;
                 return true;
             }
             return false;
-        case memregion_MainRAM:
-            mappingStart = 0x2000000;
-            mappingSize = 0x1000000;
-            memorySize = NDS::MainRAMSize;
-            return true;
-        case memregion_SWRAM:
-            mappingStart = 0x3000000;
+        case memregion_SharedWRAM:
             if (num == 0 && NDS::SWRAM_ARM9.Mem)
             {
-                mappingSize = 0x1000000;
+                mirrorStart = addr & ~NDS::SWRAM_ARM9.Mask;
+                mirrorSize = NDS::SWRAM_ARM9.Mask + 1;
                 memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM;
-                memorySize = NDS::SWRAM_ARM9.Mask + 1;
                 return true;
             }
             else if (num == 1 && NDS::SWRAM_ARM7.Mem)
             {
-                mappingSize = 0x800000;
+                mirrorStart = addr & ~NDS::SWRAM_ARM7.Mask;
+                mirrorSize = NDS::SWRAM_ARM7.Mask + 1;
                 memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM;
-                memorySize = NDS::SWRAM_ARM7.Mask + 1;
+                return true;
+            }
+            return false;
+        case memregion_WRAM7:
+            if (num == 1)
+            {
+                mirrorStart = addr & ~(NDS::ARM7WRAMSize - 1);
+                mirrorSize = NDS::ARM7WRAMSize;
                 return true;
             }
             return false;
         case memregion_VRAM:
             if (num == 0)
             {
-                // this is a gross simplification
-                // mostly to make code on vram working
-                // it doesn't take any of the actual VRAM mappings into account
-                mappingStart = 0x6000000;
-                mappingSize = 0x1000000;
-                memorySize = 0x100000;
+                mirrorStart = addr & ~0xFFFFF;
+                mirrorSize = 0x100000;
                 return true;
             }
             return false;
-        case memregion_BIOS7:
+        case memregion_VWRAM:
             if (num == 1)
             {
-                mappingStart = 0;
-                mappingSize = 0x4000;
-                memorySize = 0x4000;
+                mirrorStart = addr & ~0x3FFFF;
+                mirrorSize = 0x40000;
                 return true;
             }
             return false;
-        case memregion_WRAM7:
-            if (num == 1)
+        case memregion_NewSharedWRAM_A:
             {
-                if (NDS::SWRAM_ARM7.Mem)
+                u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]];
+                if (ptr)
                 {
-                    mappingStart = 0x3800000;
-                    mappingSize = 0x800000;
+                    memoryOffset = ptr - DSi::NWRAM_A;
+                    mirrorStart = addr & ~0xFFFF;
+                    mirrorSize = 0x10000;
+                    return true;
                 }
-                else
+                return false; // zero filled memory
+            }
+        case memregion_NewSharedWRAM_B:
+            {
+                u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]];
+                if (ptr)
                 {
-                    mappingStart = 0x3000000;
-                    mappingSize = 0x1000000;
+                    memoryOffset = ptr - DSi::NWRAM_B;
+                    mirrorStart = addr & ~0x7FFF;
+                    mirrorSize = 0x8000;
+                    return true;
                 }
-                memorySize = NDS::ARM7WRAMSize;
+                return false; // zero filled memory
+            }
+        case memregion_NewSharedWRAM_C:
+            {
+                u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]];
+                if (ptr)
+                {
+                    memoryOffset = ptr - DSi::NWRAM_C;
+                    mirrorStart = addr & ~0x7FFF;
+                    mirrorSize = 0x8000;
+                    return true;
+                }
+                return false; // zero filled memory
+            }
+        case memregion_BIOS9DSi:
+            if (num == 0)
+            {
+                mirrorStart = addr & ~0xFFFF;
+                mirrorSize = DSi::SCFG_BIOS & (1<<0) ?
0x8000 : 0x10000; return true; } return false; - case memregion_VWRAM: + case memregion_BIOS7DSi: if (num == 1) { - mappingStart = 0x6000000; - mappingSize = 0x1000000; - memorySize = 0x20000; + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<8) ? 0x8000 : 0x10000; return true; } return false; default: - // for the JIT we don't are about the rest + assert(false && "For the time being this should only be used for code"); return false; } } +u32 LocaliseAddress(int region, u32 num, u32 addr) +{ + switch (region) + { + case memregion_ITCM: + return (addr & (ITCMPhysicalSize - 1)) | (memregion_ITCM << 27); + case memregion_MainRAM: + return (addr & NDS::MainRAMMask) | (memregion_MainRAM << 27); + case memregion_BIOS9: + return (addr & 0xFFF) | (memregion_BIOS9 << 27); + case memregion_BIOS7: + return (addr & 0x3FFF) | (memregion_BIOS7 << 27); + case memregion_SharedWRAM: + if (num == 0) + return ((addr & NDS::SWRAM_ARM9.Mask) + (NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + else + return ((addr & NDS::SWRAM_ARM7.Mask) + (NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + case memregion_WRAM7: + return (addr & (NDS::ARM7WRAMSize - 1)) | (memregion_WRAM7 << 27); + case memregion_VRAM: + // TODO: take mapping properly into account + return (addr & 0xFFFFF) | (memregion_VRAM << 27); + case memregion_VWRAM: + // same here + return (addr & 0x3FFFF) | (memregion_VWRAM << 27); + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + return (ptr - DSi::NWRAM_A + (addr & 0xFFFF)) | (memregion_NewSharedWRAM_A << 27); + else + return memregion_Other << 27; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + return (ptr - DSi::NWRAM_B + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_B << 27); + else + return memregion_Other << 27; + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + return (ptr - DSi::NWRAM_C + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_C << 27); + else + return memregion_Other << 27; + } + case memregion_BIOS9DSi: + case memregion_BIOS7DSi: + return (addr & 0xFFFF) | (region << 27); + default: + assert(false && "This should only be needed for regions which can contain code"); + return memregion_Other << 27; + } +} + int ClassifyAddress9(u32 addr) { if (addr < NDS::ARM9->ITCMSize) + { return memregion_ITCM; + } else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + { return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else + } + else { + if (NDS::ConsoleType == 1 && addr >= 0xFFFF0000 && !(DSi::SCFG_BIOS & (1<<1))) + { + if ((addr >= 0xFFFF8000) && (DSi::SCFG_BIOS & (1<<0))) + return memregion_Other; + + return memregion_BIOS9DSi; + } + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + { + return memregion_BIOS9; + } + switch (addr & 0xFF000000) { case 0x02000000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[0][0] && addr < DSi::NWRAMEnd[0][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[0][1] && addr < DSi::NWRAMEnd[0][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[0][2] && addr < DSi::NWRAMEnd[0][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM9.Mem) - return memregion_SWRAM; - else - 
return memregion_Other; + return memregion_SharedWRAM; + return memregion_Other; case 0x04000000: return memregion_IO9; case 0x06000000: return memregion_VRAM; + default: + return memregion_Other; } } - return memregion_Other; } int ClassifyAddress7(u32 addr) { - if (addr < 0x00004000) + if (NDS::ConsoleType == 1 && addr < 0x00010000 && !(DSi::SCFG_BIOS & (1<<9))) + { + if (addr >= 0x00008000 && DSi::SCFG_BIOS & (1<<8)) + return memregion_Other; + + return memregion_BIOS7DSi; + } + else if (addr < 0x00004000) + { return memregion_BIOS7; + } else { switch (addr & 0xFF800000) @@ -657,10 +868,19 @@ int ClassifyAddress7(u32 addr) case 0x02800000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[1][0] && addr < DSi::NWRAMEnd[1][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[1][1] && addr < DSi::NWRAMEnd[1][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[1][2] && addr < DSi::NWRAMEnd[1][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM7.Mem) - return memregion_SWRAM; - else - return memregion_WRAM7; + return memregion_SharedWRAM; + return memregion_WRAM7; case 0x03800000: return memregion_WRAM7; case 0x04000000: @@ -740,14 +960,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + else { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; + switch (size | store) + { + case 8: return (void*)DSi::ARM9IORead8; + case 9: return (void*)DSi::ARM9IOWrite8; + case 16: return (void*)DSi::ARM9IORead16; + case 17: return (void*)DSi::ARM9IOWrite16; + case 32: return (void*)DSi::ARM9IORead32; + case 33: return (void*)DSi::ARM9IOWrite32; + } } break; case 0x06000000: @@ -781,14 +1016,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM7IORead8; + case 9: return (void*)DSi::ARM7IOWrite8; + case 16: return (void*)DSi::ARM7IORead16; + case 17: return (void*)DSi::ARM7IOWrite16; + case 32: return (void*)DSi::ARM7IORead32; + case 33: return (void*)DSi::ARM7IOWrite32; + } } break; case 0x04800000: diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h index 1a59d98..123e18e 100644 --- a/src/ARMJIT_Memory.h +++ b/src/ARMJIT_Memory.h @@ -23,7 +23,7 @@ enum memregion_DTCM, memregion_BIOS9, memregion_MainRAM, - memregion_SWRAM, + 
memregion_SharedWRAM, memregion_IO9, memregion_VRAM, memregion_BIOS7, @@ -31,18 +31,28 @@ enum memregion_IO7, memregion_Wifi, memregion_VWRAM, + + // DSi + memregion_BIOS9DSi, + memregion_BIOS7DSi, + memregion_NewSharedWRAM_A, + memregion_NewSharedWRAM_B, + memregion_NewSharedWRAM_C, + memregions_Count }; int ClassifyAddress9(u32 addr); int ClassifyAddress7(u32 addr); -bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize); +u32 LocaliseAddress(int region, u32 num, u32 addr); -bool IsMappable(int region); +bool IsFastmemCompatible(int region); void RemapDTCM(u32 newBase, u32 newSize); void RemapSWRAM(); +void RemapNWRAM(int num); void SetCodeProtection(int region, u32 offset, bool protect); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 34c1c91..d8bdd56 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -40,6 +40,12 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +#ifdef _WIN32 +const BitSet32 CallerSavedPushRegs({R10, R11}); +#else +const BitSet32 CallerSavedPushRegs({R9, R10, R11}); +#endif + void Compiler::PushRegs(bool saveHiRegs) { BitSet32 loadedRegs(RegCache.LoadedRegs); @@ -301,6 +307,107 @@ Compiler::Compiler() RET(); } + for (int consoleType = 0; consoleType < 2; consoleType++) + { + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 16; reg++) + { + if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2 || reg == ABI_PARAM3) + { + PatchedStoreFuncs[consoleType][num][size][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL; + continue; + } + + X64Reg rdMapped = (X64Reg)reg; + PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + MOV(32, R(ABI_PARAM3), R(rdMapped)); + } + else + { + MOV(32, R(ABI_PARAM2), R(rdMapped)); + } + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + MOV(64, R(ABI_PARAM2), R(RCPU)); + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9); break; + case 33: ABI_CallFunction(SlowRead7); break; + case 16: 
ABI_CallFunction(SlowRead9); break;
+                        case 17: ABI_CallFunction(SlowRead7); break;
+                        case 8: ABI_CallFunction(SlowRead9); break;
+                        case 9: ABI_CallFunction(SlowRead7); break;
+                        }
+                    }
+                    else
+                    {
+                        switch ((8 << size) | num)
+                        {
+                        case 32: ABI_CallFunction(SlowRead9); break;
+                        case 33: ABI_CallFunction(SlowRead7); break;
+                        case 16: ABI_CallFunction(SlowRead9); break;
+                        case 17: ABI_CallFunction(SlowRead7); break;
+                        case 8: ABI_CallFunction(SlowRead9); break;
+                        case 9: ABI_CallFunction(SlowRead7); break;
+                        }
+                    }
+                    ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8);
+                    if (signextend)
+                        MOVSX(32, 8 << size, rdMapped, R(RSCRATCH));
+                    else
+                        MOVZX(32, 8 << size, rdMapped, R(RSCRATCH));
+                    RET();
+                }
+            }
+        }
+    }
+
     // move the region forward to prevent overwriting the generated functions
     CodeMemSize -= GetWritableCodePtr() - ResetStart;
     ResetStart = GetWritableCodePtr();
@@ -500,6 +607,8 @@ void Compiler::Reset()
     NearCode = NearStart;
     FarCode = FarStart;
+
+    LoadStorePatches.clear();
 }
 
 bool Compiler::IsJITFault(u64 addr)
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index d1a6c07..0fe0147 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -7,6 +7,8 @@
 #include "../ARMJIT_Internal.h"
 #include "../ARMJIT_RegisterCache.h"
 
+#include <unordered_map>
+
 namespace ARMJIT
 {
@@ -18,6 +20,13 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX;
 const Gen::X64Reg RSCRATCH3 = Gen::ECX;
 const Gen::X64Reg RSCRATCH4 = Gen::R8;
 
+struct LoadStorePatch
+{
+    void* PatchFunc;
+    s16 Offset;
+    u16 Size;
+};
+
 struct Op2
 {
     Op2()
@@ -211,6 +220,11 @@ public:
     u8* NearStart;
     u8* FarStart;
 
+    void* PatchedStoreFuncs[2][2][3][16];
+    void* PatchedLoadFuncs[2][2][3][2][16];
+
+    std::unordered_map<u8*, LoadStorePatch> LoadStorePatches;
+
     u8* ResetStart;
     u32 CodeMemSize;
diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
index b780c55..2da113b 100644
--- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
@@ -17,7 +17,30 @@ int squeezePointer(T* ptr)
 
 s32 Compiler::RewriteMemAccess(u64 pc)
 {
-    return 0;
+    auto it = LoadStorePatches.find((u8*)pc);
+    if (it != LoadStorePatches.end())
+    {
+        LoadStorePatch patch = it->second;
+        LoadStorePatches.erase(it);
+
+        u8* curCodePtr = GetWritableCodePtr();
+        u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset;
+        SetCodePtr(rewritePtr);
+
+        CALL(patch.PatchFunc);
+        u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr);
+        if (remainingSize > 0)
+            NOP(remainingSize);
+
+        //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size);
+
+        SetCodePtr(curCodePtr);
+
+        return patch.Offset;
+    }
+
+    printf("this is a JIT bug %x\n", pc);
+    abort();
 }
 
 /*
@@ -91,369 +114,213 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag
         return;
     }
 
+    if (flags & memop_Store)
     {
-        if (flags & memop_Store)
-        {
-            Comp_AddCycles_CD();
-        }
-        else
-        {
-            Comp_AddCycles_CDI();
-        }
+        Comp_AddCycles_CD();
+    }
+    else
+    {
+        Comp_AddCycles_CDI();
+    }
 
-    bool addrIsStatic = Config::JIT_LiteralOptimisations
-        && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post));
-    u32 staticAddress;
-    if (addrIsStatic)
-        staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ?
-1 : 1); - OpArg rdMapped = MapReg(rd); + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + OpArg rdMapped = MapReg(rd); - if (true) - { - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); - X64Reg finalAddr = RSCRATCH3; - if (flags & memop_Post) - { - MOV(32, R(RSCRATCH3), rnMapped); + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) + { + MOV(32, R(RSCRATCH3), rnMapped); - finalAddr = rnMapped.GetSimpleReg(); - } + finalAddr = rnMapped.GetSimpleReg(); + } - if (op2.IsImm) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } else - { - OpArg rm = MapReg(op2.Reg.Reg); + MOV_sum(32, finalAddr, rnMapped, offset); + } + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) - { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); - } - else - { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); - if (flags & memop_SubtractOffset) - { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); - } - else - MOV_sum(32, finalAddr, rnMapped, offset); - } - } + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + u8* memopStart = GetWritableCodePtr(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] + : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; - /*int expectedTarget = Num == 0 - ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) - : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - if (CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; + assert(patch.PatchFunc != NULL); - bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); + MOV(64, R(RSCRATCH), ImmPtr(Num == 0 ? 
ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); - switch (expectedTarget) + X64Reg maskedAddr = RSCRATCH3; + if (size > 8) { - case memregion_MainRAM: - case memregion_DTCM: - case memregion_WRAM7: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_IO9: - case memregion_IO7: - case memregion_VWRAM: - compileFastPath = true; - break; - case memregion_Wifi: - compileFastPath = size >= 16; - break; - case memregion_VRAM: - compileFastPath = !(flags & memop_Store) || size >= 16; - case memregion_BIOS9: - compileFastPath = !(flags & memop_Store); - break; - default: break; + maskedAddr = RSCRATCH2; + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + AND(32, R(RSCRATCH2), Imm8(addressMask)); } - if (addrIsStatic && !compileFastPath) + u8* memopLoadStoreLocation = GetWritableCodePtr(); + if (flags & memop_Store) { - compileFastPath = false; - compileSlowPath = true; + MOV(size, MRegSum(RSCRATCH, maskedAddr), rdMapped); } - - if (addrIsStatic && compileSlowPath) - MOV(32, R(RSCRATCH3), Imm32(staticAddress)); -*/ - /*if (compileFastPath) + else { - FixupBranch slowPath; - if (compileSlowPath) - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SHR(32, R(RSCRATCH), Imm8(9)); - if (flags & memop_Store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - - slowPath = J_CC(CC_NE, true); - } + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); - if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 - || expectedTarget == memregion_BIOS9) + if (size == 32) { - u8* data; - u32 mask; - if (expectedTarget == memregion_MainRAM) - { - data = NDS::MainRAM; - mask = MAIN_RAM_SIZE - 1; - } - else if (expectedTarget == memregion_BIOS9) - { - data = NDS::ARM9BIOS; - mask = 0xFFF; - } - else - { - data = NDS::ARM7WRAM; - mask = 0xFFFF; - } - OpArg memLoc; - if (addrIsStatic) - { - memLoc = M(data + ((staticAddress & mask & addressMask))); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm32(mask & addressMask)); - memLoc = MDisp(RSCRATCH, squeezePointer(data)); - } - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_DTCM) - { - if (addrIsStatic) - MOV(32, R(RSCRATCH), Imm32(staticAddress)); - else - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) - { - MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - if (addrIsStatic) - { - MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm8(addressMask)); - } - AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else - { - u32 maskedDataRegion; - - if (addrIsStatic) - { - maskedDataRegion = staticAddress; - MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); - } - else - { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - AND(32, R(ABI_PARAM1), Imm8(addressMask)); - - maskedDataRegion = CurInstr.DataRegion; - if (Num == 0) - maskedDataRegion &= ~0xFFFFFF; - else - maskedDataRegion &= ~0x7FFFFF; - } + } - void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); + patch.Offset = memopStart - memopLoadStoreLocation; + patch.Size = GetWritableCodePtr() - memopStart; - if (flags & memop_Store) - { - PushRegs(false); + assert(patch.Size >= 5); - MOV(32, R(ABI_PARAM2), rdMapped); + LoadStorePatches[memopLoadStoreLocation] = patch; + } + else + { + PushRegs(false); - ABI_CallFunction((void(*)())func); + if (Num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); - PopRegs(false); - } - else + switch (size | NDS::ConsoleType) { - if (!addrIsStatic) - MOV(32, rdMapped, R(RSCRATCH3)); - - PushRegs(false); - - ABI_CallFunction((void(*)())func); - - PopRegs(false); - - if (!addrIsStatic) - MOV(32, R(RSCRATCH3), rdMapped); - - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + case 33: CALL((void*)&SlowWrite9); break; + case 17: CALL((void*)&SlowWrite9); break; + case 9: CALL((void*)&SlowWrite9); break; } } - - if ((size == 32 && !(flags & memop_Store))) + else { - if (addrIsStatic) - { - if (staticAddress & 0x3) - ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); - } - else + switch (size | NDS::ConsoleType) { - AND(32, R(RSCRATCH3), Imm8(0x3)); - SHL(32, R(RSCRATCH3), Imm8(3)); - ROR_(32, rdMapped, R(RSCRATCH3)); + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + case 33: CALL((void*)&SlowRead9); break; + case 17: CALL((void*)&SlowRead9); break; + case 9: CALL((void*)&SlowRead9); break; } } - - if (compileSlowPath) - { - SwitchToFarCode(); - SetJumpTarget(slowPath); - } } -*/ - if (true) + else { - PushRegs(false); - - if (Num == 0) + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) { - MOV(64, R(ABI_PARAM2), R(RCPU)); - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) - { - MOV(32, R(ABI_PARAM3), rdMapped); + MOV(32, R(ABI_PARAM2), rdMapped); - switch (size) - { - case 32: CALL((void*)&SlowWrite9); break; - case 16: CALL((void*)&SlowWrite9); break; - case 
8: CALL((void*)&SlowWrite9); break; - } - } - else + switch (size | NDS::ConsoleType) { - switch (size) - { - case 32: CALL((void*)&SlowRead9); break; - case 16: CALL((void*)&SlowRead9); break; - case 8: CALL((void*)&SlowRead9); break; - } + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + case 33: CALL((void*)&SlowWrite7); break; + case 17: CALL((void*)&SlowWrite7); break; + case 9: CALL((void*)&SlowWrite7); break; } } else { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) + switch (size | NDS::ConsoleType) { - MOV(32, R(ABI_PARAM2), rdMapped); - - switch (size) - { - case 32: CALL((void*)&SlowWrite7); break; - case 16: CALL((void*)&SlowWrite7); break; - case 8: CALL((void*)&SlowWrite7); break; - } - } - else - { - switch (size) - { - case 32: CALL((void*)&SlowRead7); break; - case 16: CALL((void*)&SlowRead7); break; - case 8: CALL((void*)&SlowRead7); break; - } + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + case 33: CALL((void*)&SlowRead7); break; + case 17: CALL((void*)&SlowRead7); break; + case 9: CALL((void*)&SlowRead7); break; } } + } - PopRegs(false); + PopRegs(false); - if (!(flags & memop_Store)) - { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - } - } -/* - if (compileFastPath && compileSlowPath) + if (!(flags & memop_Store)) { - FixupBranch ret = J(true); - SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } - if (!(flags & memop_Store) && rd == 15) + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); { - if (size < 32) - printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + if (Num == 1) { - if (Num == 1) - AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rdMapped.GetSimpleReg()); + if (Thumb) + OR(32, rdMapped, Imm8(0x1)); + else + AND(32, rdMapped, Imm8(0xFE)); } + Comp_JumpTo(rdMapped.GetSimpleReg()); } } } @@ -470,7 +337,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int flags = 0; if (store) flags |= memop_Store; - if (decrement) + if (decrement && preinc) flags |= memop_SubtractOffset; Op2 offset = preinc ? Op2(4) : Op2(0); @@ -481,96 +348,52 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - // we need to make sure that the stack stays aligned to 16 bytes -#ifdef _WIN32 - // include shadow - u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; -#else - u32 stackAlloc = ((regsCount + 1) & ~1) * 8; -#endif - u32 allocOffset = stackAlloc - regsCount * 8; -/* int expectedTarget = Num == 0 - ? ClassifyAddress9(CurInstr.DataRegion) - : ClassifyAddress7(CurInstr.DataRegion); - if (usermode || CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; - - bool compileFastPath = false; + ? 
ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - switch (expectedTarget) - { - case memregion_DTCM: - case memregion_MainRAM: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_WRAM7: - compileFastPath = true; - break; - default: - break; - } -*/ if (!store) Comp_AddCycles_CDI(); else Comp_AddCycles_CD(); + bool compileFastPath = Config::JIT_FastMemory + && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = (((regsCount + 4 + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#else + u32 stackAlloc = (((regsCount + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; + if (decrement) - { - MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4 + (preinc ? 0 : 4))); else - MOV(32, R(RSCRATCH4), MapReg(rn)); -/* + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(preinc ? 4 : 0)); + if (compileFastPath) { - assert(!usermode); + AND(32, R(RSCRATCH4), Imm8(~3)); - MOV(32, R(RSCRATCH), R(RSCRATCH4)); - SHR(32, R(RSCRATCH), Imm8(9)); + u8* fastPathStart = GetWritableCodePtr(); + u8* firstLoadStoreAddr; - if (store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - FixupBranch slowPath = J_CC(CC_NE, true); + bool firstLoadStore = true; + + MOV(64, R(RSCRATCH2), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + ADD(64, R(RSCRATCH2), R(RSCRATCH4)); + MOV(32, R(RSCRATCH3), R(RSCRATCH4)); - if (expectedTarget == memregion_DTCM) - { - SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); - LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); - } - else if (expectedTarget == memregion_MainRAM) - { - AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); - } - else if (expectedTarget == memregion_WRAM7) - { - AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); - } - else // SWRAM - { - AND(32, R(RSCRATCH4), Imm8(~3)); - AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - } u32 offset = 0; for (int reg : regs) { - if (preinc) - offset += 4; - OpArg mem = MDisp(RSCRATCH4, offset); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + + OpArg mem = MDisp(RSCRATCH2, offset); if (store) { if (RegCache.LoadedRegs & (1 << reg)) @@ -580,6 +403,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc else { LoadReg(reg, RSCRATCH); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); MOV(32, mem, R(RSCRATCH)); } } @@ -595,13 +420,19 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SaveReg(reg, RSCRATCH); } } - if (!preinc) - offset += 4; + offset += 4; + + firstLoadStore = false; } + LoadStorePatch patch; + patch.Size = GetWritableCodePtr() - fastPathStart; + patch.Offset = fastPathStart - firstLoadStoreAddr; SwitchToFarCode(); - SetJumpTarget(slowPath); - }*/ + patch.PatchFunc = GetWritableCodePtr(); + + LoadStorePatches[firstLoadStoreAddr] = patch; + } if (!store) { @@ -618,12 +449,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } PopRegs(false); @@ -715,25 +546,24 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); PopRegs(false); } -/* + if (compileFastPath) { - FixupBranch ret = J(true); + RET(); SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + } if (!store && regs[15]) { diff --git a/src/CP15.cpp b/src/CP15.cpp index 3d64259..992c83f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -608,6 +608,27 @@ void ARMv5::CP15Write(u32 id, u32 val) ITCMSetting = val; UpdateITCMSetting(); return; + + case 0xF00: + //printf("cache debug index register %08X\n", val); + return; + + case 0xF10: + //printf("cache debug instruction tag %08X\n", val); + return; + + case 0xF20: + //printf("cache debug data tag %08X\n", val); + return; + + case 0xF30: + //printf("cache debug instruction cache %08X\n", val); + return; + + case 0xF40: + //printf("cache debug data cache %08X\n", val); + return; + } if ((id&0xF00)!=0x700) diff --git a/src/Config.cpp b/src/Config.cpp index edf84f2..de1c70d 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,14 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; -int JIT_LiteralOptimisations = true; -#endif - -#ifdef JIT_ENABLED -int JIT_Enable = false; -int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = true; +int JIT_BranchOptimisations = 2; int JIT_LiteralOptimisations = true; int JIT_FastMemory = true; #endif @@ -66,16 +59,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, - {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, -#endif - -#ifdef JIT_ENABLED - {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, - {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, - {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, + {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 7b19a4b..5916b4a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,14 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; -extern int JIT_LiteralOptimisations; -#endif - -#ifdef JIT_ENABLED -extern int JIT_Enable; -extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; +extern int JIT_BranchOptimisations; extern int JIT_LiteralOptimisations; extern int JIT_FastMemory; #endif diff --git a/src/DSi.cpp b/src/DSi.cpp index 216f724..97a63cd 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -26,6 +26,11 @@ #include "NDSCart.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi_NDMA.h" #include "DSi_I2C.h" #include "DSi_SD.h" @@ -34,15 +39,6 @@ #include "tiny-AES-c/aes.hpp" -namespace NDS -{ - -extern ARMv5* ARM9; -extern ARMv4* ARM7; - -} - - namespace DSi { @@ -59,9 +55,9 @@ u8 ARM7iBIOS[0x10000]; u32 MBK[2][9]; -u8 NWRAM_A[0x40000]; -u8 NWRAM_B[0x40000]; -u8 NWRAM_C[0x40000]; +u8* NWRAM_A; +u8* NWRAM_B; +u8* NWRAM_C; u8* NWRAMMap_A[2][4]; u8* NWRAMMap_B[3][8]; @@ -86,6 +82,12 @@ u8 ARM7Init[0x3C00]; bool Init() { +#ifndef JIT_ENABLED + NWRAM_A = new u8[NWRAMSize]; + NWRAM_B = new u8[NWRAMSize]; + NWRAM_C = new u8[NWRAMSize]; +#endif + if 
(!DSi_I2C::Init()) return false; if (!DSi_AES::Init()) return false; @@ -106,6 +108,12 @@ bool Init() void DeInit() { +#ifndef JIT_ENABLED + delete[] NWRAM_A; + delete[] NWRAM_B; + delete[] NWRAM_C; +#endif + DSi_I2C::DeInit(); DSi_AES::DeInit(); @@ -176,7 +184,12 @@ void SoftReset() NDS::ARM9->Reset(); NDS::ARM7->Reset(); + NDS::ARM9->CP15Reset(); + memcpy(NDS::ARM9->ITCM, ITCMInit, 0x8000); +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidateITCM(); +#endif DSi_AES::Reset(); @@ -274,9 +287,9 @@ bool LoadNAND() { printf("Loading DSi NAND\n"); - memset(NWRAM_A, 0, 0x40000); - memset(NWRAM_B, 0, 0x40000); - memset(NWRAM_C, 0, 0x40000); + memset(NWRAM_A, 0, NWRAMSize); + memset(NWRAM_B, 0, NWRAMSize); + memset(NWRAM_C, 0, NWRAMSize); memset(MBK, 0, sizeof(MBK)); memset(NWRAMMap_A, 0, sizeof(NWRAMMap_A)); @@ -527,6 +540,8 @@ void MapNWRAM_A(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(0); + int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -558,6 +573,8 @@ void MapNWRAM_B(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(1); + int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -593,6 +610,8 @@ void MapNWRAM_C(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(2); + int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -625,6 +644,8 @@ void MapNWRAMRange(u32 cpu, u32 num, u32 val) u32 oldval = MBK[cpu][5+num]; if (oldval == val) return; + ARMJIT_Memory::RemapNWRAM(num); + MBK[cpu][5+num] = val; // TODO: what happens when the ranges are 'out of range'???? @@ -826,19 +847,31 @@ void ARM9Write8(u32 addr, u8 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write8(addr, val); @@ -859,19 +892,31 @@ void ARM9Write16(u32 addr, u16 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; 
} return NDS::ARM9Write16(addr, val); @@ -892,19 +937,31 @@ void ARM9Write32(u32 addr, u32 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write32(addr, val); @@ -1085,19 +1142,37 @@ void ARM7Write8(u32 addr, u8 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); +#endif + } return; } return NDS::ARM7Write8(addr, val); @@ -1118,19 +1193,31 @@ void ARM7Write16(u32 addr, u16 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write16(addr, val); @@ -1151,19 +1238,31 @@ void ARM7Write32(u32 addr, u32 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < 
NWRAMEnd[1][1])
     {
         u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]];
-        if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val;
+        if (ptr)
+        {
+            *(u32*)&ptr[addr & 0x7FFF] = val;
+            ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr);
+        }
         return;
     }
     if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2])
     {
         u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]];
-        if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val;
+        if (ptr)
+        {
+            *(u32*)&ptr[addr & 0x7FFF] = val;
+            ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr);
+        }
         return;
     }
     return NDS::ARM7Write32(addr, val);
@@ -1521,7 +1620,7 @@ u8 ARM7IORead8(u32 addr)
     case 0x04004501: return DSi_I2C::Cnt;
 
     case 0x04004D00: if (SCFG_BIOS & (1<<10)) return 0; return ConsoleID & 0xFF;
     case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF;
     case 0x04004D02: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 16) & 0xFF;
     case 0x04004D03: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 24) & 0xFF;
     case 0x04004D04: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 32) & 0xFF;
diff --git a/src/DSi.h b/src/DSi.h
index 8cc8fd5..40f22bb 100644
--- a/src/DSi.h
+++ b/src/DSi.h
@@ -25,6 +25,8 @@
 namespace DSi
 {
 
+extern u16 SCFG_BIOS;
+
 extern u8 ARM9iBIOS[0x10000];
 extern u8 ARM7iBIOS[0x10000];
 
@@ -34,6 +36,19 @@ extern u64 ConsoleID;
 extern DSi_SDHost* SDMMC;
 extern DSi_SDHost* SDIO;
 
+const u32 NWRAMSize = 0x40000;
+
+extern u8* NWRAM_A;
+extern u8* NWRAM_B;
+extern u8* NWRAM_C;
+
+extern u8* NWRAMMap_A[2][4];
+extern u8* NWRAMMap_B[3][8];
+extern u8* NWRAMMap_C[3][8];
+
+extern u32 NWRAMStart[2][3];
+extern u32 NWRAMEnd[2][3];
+extern u32 NWRAMMask[2][3];
 
 bool Init();
 void DeInit();
diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp
index 9984f5e..e22c708 100644
--- a/src/DSi_I2C.cpp
+++ b/src/DSi_I2C.cpp
@@ -21,6 +21,7 @@
 #include "DSi.h"
 #include "DSi_I2C.h"
 #include "DSi_Camera.h"
+#include "ARM.h"
 
 namespace DSi_BPTWL
 {
@@ -108,7 +109,8 @@ void Write(u8 val, bool last)
             printf("BPTWL: soft-reset\n");
             val = 0; // checkme
             // TODO: soft-reset might need to be scheduled later!
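The hunk continues below by replacing the direct DSi::SoftReset() call with NDS::ARM7->Halt(4): resetting the whole console from inside an MMIO write handler would tear down JIT state while a compiled block is still on the call stack. A minimal sketch of how such a deferred reset can then be drained once execution has unwound — the frame-loop function and the pending-reset check are illustrative assumptions, not part of this patch:

void RunFrameAndDrainReset()
{
    NDS::RunFrame();

    // Halt state 4 was set by the BPTWL soft-reset write above; servicing
    // it here means no JIT-compiled code is executing anymore.
    if (NDS::ARM7->Halted == 4)
    {
        NDS::ARM7->Halted = 0;
        DSi::SoftReset();
    }
}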
- DSi::SoftReset(); + // TODO: this has been moved for the JIT to work, nothing is confirmed here + NDS::ARM7->Halt(4); CurPos = -1; return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 3d65482..6981a42 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -32,8 +32,11 @@ #include "Wifi.h" #include "AREngine.h" #include "Platform.h" + +#ifdef JIT_ENABLED #include "ARMJIT.h" #include "ARMJIT_Memory.h" +#endif #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -173,7 +176,7 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); #else - MainRAM = new u8[MainRAMSize]; + MainRAM = new u8[0x1000000]; ARM7WRAM = new u8[ARM7WRAMSize]; SharedWRAM = new u8[SharedWRAMSize]; #endif @@ -1837,7 +1840,7 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -1902,7 +1905,7 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -2031,16 +2034,13 @@ void ARM9Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2090,16 +2090,13 @@ void ARM9Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2168,16 +2165,13 @@ void ARM9Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return ; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2235,7 +2229,7 @@ void ARM9Write32(u32 addr, u32 val) return; } - printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); + //printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); } bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) @@ -2475,16 +2469,13 @@ void ARM7Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2552,16 +2543,13 @@ void 
ARM7Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2639,16 +2627,13 @@ void ARM7Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; diff --git a/src/NDS.h b/src/NDS.h index 4b4f9a1..e0a5045 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -165,6 +165,8 @@ extern u16 ARM7BIOSProt; extern u8* MainRAM; extern u32 MainRAMMask; +const u32 MainRAMMaxSize = 0x1000000; + const u32 SharedWRAMSize = 0x8000; extern u8* SharedWRAM; diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 09faf4e..9ee7b9a 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -32,6 +32,7 @@ EmuSettingsDialog* EmuSettingsDialog::currentDlg = nullptr; extern char* EmuDirectory; +extern bool RunningSomething; EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::EmuSettingsDialog) @@ -53,6 +54,22 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->cbxConsoleType->setCurrentIndex(Config::ConsoleType); ui->chkDirectBoot->setChecked(Config::DirectBoot != 0); + +#ifdef JIT_ENABLED + ui->chkEnableJIT->setChecked(Config::JIT_Enable != 0); + ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); + ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); + ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); +#else + ui->chkEnableJIT->setDisabled(true); + ui->chkJITBranchOptimisations->setDisabled(true); + ui->chkJITLiteralOptimisations->setDisabled(true); + ui->chkJITFastMemory->setDisabled(true); + ui->spnJITMaximumBlockSize->setDisabled(true); +#endif + + on_chkEnableJIT_toggled(); } EmuSettingsDialog::~EmuSettingsDialog() @@ -102,29 +119,78 @@ void EmuSettingsDialog::verifyFirmware() } } -void EmuSettingsDialog::on_EmuSettingsDialog_accepted() +void EmuSettingsDialog::done(int r) { - verifyFirmware(); - - strncpy(Config::BIOS9Path, ui->txtBIOS9Path->text().toStdString().c_str(), 1023); Config::BIOS9Path[1023] = '\0'; - strncpy(Config::BIOS7Path, ui->txtBIOS7Path->text().toStdString().c_str(), 1023); Config::BIOS7Path[1023] = '\0'; - strncpy(Config::FirmwarePath, ui->txtFirmwarePath->text().toStdString().c_str(), 1023); Config::FirmwarePath[1023] = '\0'; - - strncpy(Config::DSiBIOS9Path, ui->txtDSiBIOS9Path->text().toStdString().c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; - strncpy(Config::DSiBIOS7Path, ui->txtDSiBIOS7Path->text().toStdString().c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; - strncpy(Config::DSiFirmwarePath, 
ui->txtDSiFirmwarePath->text().toStdString().c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; - strncpy(Config::DSiNANDPath, ui->txtDSiNANDPath->text().toStdString().c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; - - Config::ConsoleType = ui->cbxConsoleType->currentIndex(); - Config::DirectBoot = ui->chkDirectBoot->isChecked() ? 1:0; - - Config::Save(); + if (r == QDialog::Accepted) + { + verifyFirmware(); + + int consoleType = ui->cbxConsoleType->currentIndex(); + int directBoot = ui->chkDirectBoot->isChecked() ? 1:0; + + int jitEnable = ui->chkEnableJIT->isChecked() ? 1:0; + int jitMaxBlockSize = ui->spnJITMaximumBlockSize->value(); + int jitBranchOptimisations = ui->chkJITBranchOptimisations->isChecked() ? 1:0; + int jitLiteralOptimisations = ui->chkJITLiteralOptimisations->isChecked() ? 1:0; + int jitFastMemory = ui->chkJITFastMemory->isChecked() ? 1:0; + + std::string bios9Path = ui->txtBIOS9Path->text().toStdString(); + std::string bios7Path = ui->txtBIOS7Path->text().toStdString(); + std::string firmwarePath = ui->txtFirmwarePath->text().toStdString(); + std::string dsiBios9Path = ui->txtDSiBIOS9Path->text().toStdString(); + std::string dsiBios7Path = ui->txtDSiBIOS7Path->text().toStdString(); + std::string dsiFirmwarePath = ui->txtDSiFirmwarePath->text().toStdString(); + std::string dsiNANDPath = ui->txtDSiNANDPath->text().toStdString(); + + if (consoleType != Config::ConsoleType + || directBoot != Config::DirectBoot +#ifdef JIT_ENABLED + || jitEnable != Config::JIT_Enable + || jitMaxBlockSize != Config::JIT_MaxBlockSize + || jitBranchOptimisations != Config::JIT_BranchOptimisations + || jitLiteralOptimisations != Config::JIT_LiteralOptimisations + || jitFastMemory != Config::JIT_FastMemory +#endif + || strcmp(Config::BIOS9Path, bios9Path.c_str()) != 0 + || strcmp(Config::BIOS7Path, bios7Path.c_str()) != 0 + || strcmp(Config::FirmwarePath, firmwarePath.c_str()) != 0 + || strcmp(Config::DSiBIOS9Path, dsiBios9Path.c_str()) != 0 + || strcmp(Config::DSiBIOS7Path, dsiBios7Path.c_str()) != 0 + || strcmp(Config::DSiFirmwarePath, dsiFirmwarePath.c_str()) != 0 + || strcmp(Config::DSiNANDPath, dsiNANDPath.c_str()) != 0) + { + if (RunningSomething + && QMessageBox::warning(this, "Reset necessary to apply changes", + "The emulation will be reset for the changes to take place", + QMessageBox::Yes, QMessageBox::Cancel) != QMessageBox::Yes) + return; + + strncpy(Config::BIOS9Path, bios9Path.c_str(), 1023); Config::BIOS9Path[1023] = '\0'; + strncpy(Config::BIOS7Path, bios7Path.c_str(), 1023); Config::BIOS7Path[1023] = '\0'; + strncpy(Config::FirmwarePath, firmwarePath.c_str(), 1023); Config::FirmwarePath[1023] = '\0'; + + strncpy(Config::DSiBIOS9Path, dsiBios9Path.c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; + strncpy(Config::DSiBIOS7Path, dsiBios7Path.c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; + strncpy(Config::DSiFirmwarePath, dsiFirmwarePath.c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; + strncpy(Config::DSiNANDPath, dsiNANDPath.c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; + + #ifdef JIT_ENABLED + Config::JIT_Enable = jitEnable; + Config::JIT_MaxBlockSize = jitMaxBlockSize; + Config::JIT_BranchOptimisations = jitBranchOptimisations; + Config::JIT_LiteralOptimisations = jitLiteralOptimisations; + Config::JIT_FastMemory = jitFastMemory; + #endif + + Config::ConsoleType = consoleType; + Config::DirectBoot = directBoot; + + Config::Save(); + } + } - closeDlg(); -} + QDialog::done(r); -void EmuSettingsDialog::on_EmuSettingsDialog_rejected() -{ closeDlg(); } @@ 
-211,3 +277,12 @@ void EmuSettingsDialog::on_btnDSiNANDBrowse_clicked()
 
     ui->txtDSiNANDPath->setText(file);
 }
+
+void EmuSettingsDialog::on_chkEnableJIT_toggled()
+{
+    bool disabled = !ui->chkEnableJIT->isChecked();
+    ui->chkJITBranchOptimisations->setDisabled(disabled);
+    ui->chkJITLiteralOptimisations->setDisabled(disabled);
+    ui->chkJITFastMemory->setDisabled(disabled);
+    ui->spnJITMaximumBlockSize->setDisabled(disabled);
+}
\ No newline at end of file
diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.h b/src/frontend/qt_sdl/EmuSettingsDialog.h
index f604ba5..268036c 100644
--- a/src/frontend/qt_sdl/EmuSettingsDialog.h
+++ b/src/frontend/qt_sdl/EmuSettingsDialog.h
@@ -51,8 +51,7 @@ public:
     }
 
 private slots:
-    void on_EmuSettingsDialog_accepted();
-    void on_EmuSettingsDialog_rejected();
+    void done(int r);
 
     void on_btnBIOS9Browse_clicked();
     void on_btnBIOS7Browse_clicked();
@@ -63,6 +62,8 @@
     void on_btnDSiFirmwareBrowse_clicked();
     void on_btnDSiNANDBrowse_clicked();
 
+    void on_chkEnableJIT_toggled();
+
 private:
     void verifyFirmware();
 
diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.ui b/src/frontend/qt_sdl/EmuSettingsDialog.ui
index 4894fa5..11d48cc 100644
--- a/src/frontend/qt_sdl/EmuSettingsDialog.ui
+++ b/src/frontend/qt_sdl/EmuSettingsDialog.ui

[The EmuSettingsDialog.ui hunks were garbled in extraction: the XML markup was stripped, leaving only text values. What remains recoverable: the dialog is resized from 490x392 to 514x359 (the QLayout::SetFixedSize policy is kept), and its contents are reorganised into a tab widget ("tabWidget") with three tabs. "General" holds the console-type combo box and the "Boot game directly" checkbox plus a vertical spacer; "BIOS Files" holds the former "DS mode" and "DSi mode" groups with their path fields, tooltips and Browse buttons; the new "CPU Emulation" tab adds the "Enable JIT recompiler" checkbox, a "Maximum JIT block size:" spinbox (range 1-32, default 32) and the "Branch Optimisations", "Literal Optimisations" and "Fast Memory" checkboxes. An explicit tab order is added (tabWidget, cbxConsoleType, chkDirectBoot, the path fields and Browse buttons, then chkEnableJIT and spnJITMaximumBlockSize), and the buttonBox accept()/reject() connection coordinates are updated for the new geometry.]

diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp
index fa542ad..4557d0e 100644
--- a/src/frontend/qt_sdl/main.cpp
+++ b/src/frontend/qt_sdl/main.cpp
@@ -1641,7 +1641,14 @@ void MainWindow::onStop()
 
 void MainWindow::onOpenEmuSettings()
 {
-    EmuSettingsDialog::openDlg(this);
+    EmuSettingsDialog* dlg = EmuSettingsDialog::openDlg(this);
+    connect(dlg, &EmuSettingsDialog::finished, this, &MainWindow::onEmuSettingsDialogFinished);
+}
+
+void MainWindow::onEmuSettingsDialogFinished(int res)
+{
+    if (RunningSomething)
+        onReset();
 }
 
 void MainWindow::onOpenInputConfig()
diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h
index 279aed8..eec2a48 100644
--- a/src/frontend/qt_sdl/main.h
+++ b/src/frontend/qt_sdl/main.h
@@ -199,6 +199,7 @@ private slots:
     void onStop();
 
     void onOpenEmuSettings();
+    void onEmuSettingsDialogFinished(int res);
     void onOpenInputConfig();
     void onInputConfigFinished(int res);
     void onOpenVideoSettings();
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
deleted file mode 100644
index 0df9c6c..0000000
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
-    Copyright 2016-2020 Arisotura
-
-    This file is part of melonDS.
-
-    melonDS is free software: you can redistribute it and/or modify it under
-    the terms of the GNU General Public License as published by the Free
-    Software Foundation, either version 3 of the License, or (at your option)
-    any later version.
-
-    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
-    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with melonDS. If not, see http://www.gnu.org/licenses/.
-*/ - -#include -#include - -#include "libui/ui.h" - -#include "../types.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" - - -void ApplyNewSettings(int type); - -extern bool RunningSomething; - -namespace DlgEmuSettings -{ - -bool opened; -uiWindow* win; - -uiCheckbox* cbDirectBoot; - -#ifdef JIT_ENABLED -uiCheckbox* cbJITEnabled; -uiEntry* enJITMaxBlockSize; -uiCheckbox* cbJITBranchOptimisations; -uiCheckbox* cbJITLiteralOptimisations; -#endif - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - opened = false; - return 1; -} - -void OnCancel(uiButton* btn, void* blarg) -{ - uiControlDestroy(uiControl(win)); - opened = false; -} - -void OnOk(uiButton* btn, void* blarg) -{ -#ifdef JIT_ENABLED - bool restart = false; - - bool enableJit = uiCheckboxChecked(cbJITEnabled); - char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); - long blockSize = strtol(maxBlockSizeStr, NULL, 10); - bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); - bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); - uiFreeText(maxBlockSizeStr); - if (blockSize < 1) - blockSize = 1; - if (blockSize > 32) - blockSize = 32; - - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize - || branchOptimisations != Config::JIT_BrancheOptimisations - || literalOptimisations != Config::JIT_LiteralOptimisations) - { - if (RunningSomething && - !uiMsgBoxConfirm(win, "Reset emulator", - "Changing JIT settings requires a reset.\n\nDo you want to continue?")) - return; - - Config::JIT_Enable = enableJit; - Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = branchOptimisations; - Config::JIT_LiteralOptimisations = literalOptimisations; - - restart = true; - } -#endif - - Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); - - Config::Save(); - - uiControlDestroy(uiControl(win)); - opened = false; - -#ifdef JIT_ENABLED - if (restart) - ApplyNewSettings(4); -#endif -} - -#ifdef JIT_ENABLED -void OnJITStateChanged(uiCheckbox* cb, void* blarg) -{ - if (uiCheckboxChecked(cb)) - { - uiControlEnable(uiControl(enJITMaxBlockSize)); - uiControlEnable(uiControl(cbJITBranchOptimisations)); - uiControlEnable(uiControl(cbJITLiteralOptimisations)); - } - else - { - uiControlDisable(uiControl(enJITMaxBlockSize)); - uiControlDisable(uiControl(cbJITBranchOptimisations)); - uiControlDisable(uiControl(cbJITLiteralOptimisations)); - } -} -#endif - -void Open() -{ - if (opened) - { - uiControlSetFocus(uiControl(win)); - return; - } - - opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0); - uiWindowSetMargined(win, 1); - uiWindowOnClosing(win, OnCloseWindow, NULL); - - uiBox* top = uiNewVerticalBox(); - uiWindowSetChild(win, uiControl(top)); - - { - uiBox* in_ctrl = uiNewVerticalBox(); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - cbDirectBoot = uiNewCheckbox("Boot game directly"); - uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); - } - -#ifdef JIT_ENABLED - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiGroup* grp = uiNewGroup("JIT"); - uiBoxAppend(top, uiControl(grp), 1); - - uiBox* in_ctrl = uiNewVerticalBox(); - uiGroupSetChild(grp, uiControl(in_ctrl)); - - cbJITEnabled = uiNewCheckbox("Enable JIT recompiler"); - uiBoxAppend(in_ctrl, uiControl(cbJITEnabled), 0); - - uiCheckboxOnToggled(cbJITEnabled, OnJITStateChanged, NULL); - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("Maximum block size (1-32): "); - 
uiBoxAppend(row, uiControl(lbl), 0); - - enJITMaxBlockSize = uiNewEntry(); - uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); - uiBoxAppend(row, uiControl(lbl), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); - uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); - uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); - } - } -#endif - - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiBox* in_ctrl = uiNewHorizontalBox(); - uiBoxSetPadded(in_ctrl, 1); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(in_ctrl, uiControl(dummy), 1); - - uiButton* btncancel = uiNewButton("Cancel"); - uiButtonOnClicked(btncancel, OnCancel, NULL); - uiBoxAppend(in_ctrl, uiControl(btncancel), 0); - - uiButton* btnok = uiNewButton("Ok"); - uiButtonOnClicked(btnok, OnOk, NULL); - uiBoxAppend(in_ctrl, uiControl(btnok), 0); - } - - uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); - -#ifdef JIT_ENABLED - uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); - { - char maxBlockSizeStr[10]; - sprintf(maxBlockSizeStr, "%d", Config::JIT_MaxBlockSize); - uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); - } - OnJITStateChanged(cbJITEnabled, NULL); - - uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); - uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); -#endif - - uiControlShow(uiControl(win)); -} - -void Close() -{ - if (!opened) return; - uiControlDestroy(uiControl(win)); - opened = false; -} - -} diff --git a/src/libui_sdl/libui/ui.h b/src/libui_sdl/libui/ui.h deleted file mode 100644 index e45fe91..0000000 --- a/src/libui_sdl/libui/ui.h +++ /dev/null @@ -1,764 +0,0 @@ -// 6 april 2015 - -// TODO add a uiVerifyControlType() function that can be used by control implementations to verify controls - -#ifndef __LIBUI_UI_H__ -#define __LIBUI_UI_H__ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// this macro is generated by cmake -#ifdef libui_EXPORTS -#ifdef _WIN32 -#define _UI_EXTERN __declspec(dllexport) extern -#else -#define _UI_EXTERN __attribute__((visibility("default"))) extern -#endif -#else -// TODO add __declspec(dllimport) on windows, but only if not static -#define _UI_EXTERN extern -#endif - -// C++ is really really really really really really dumb about enums, so screw that and just make them anonymous -// This has the advantage of being ABI-able should we ever need an ABI... -#define _UI_ENUM(s) typedef unsigned int s; enum - -// This constant is provided because M_PI is nonstandard. -// This comes from Go's math.Pi, which in turn comes from http://oeis.org/A000796. -#define uiPi 3.14159265358979323846264338327950288419716939937510582097494459 - -// TODO uiBool? 
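
The _UI_ENUM macro above gives every libui enum a fixed, ABI-stable representation: the named type is a plain unsigned int and the enumerators are anonymous constants. A minimal sketch of what one expansion behaves like (the uiExample names are illustrative, not part of libui):

    #include <type_traits>

    // _UI_ENUM(uiExample) expands to: typedef unsigned int uiExample; enum { ... };
    typedef unsigned int uiExample;
    enum
    {
        uiExampleFirst,   // anonymous enumerators: plain integer constants
        uiExampleSecond,
    };

    // the named type is always exactly unsigned int, independent of the
    // compiler's usual enum-sizing rules
    static_assert(std::is_same<uiExample, unsigned int>::value, "ABI-stable enum");
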
- -typedef struct uiInitOptions uiInitOptions; - -struct uiInitOptions { - size_t Size; -}; - -_UI_EXTERN const char *uiInit(uiInitOptions *options); -_UI_EXTERN void uiUninit(void); -_UI_EXTERN void uiFreeInitError(const char *err); - -_UI_EXTERN void uiMain(void); -_UI_EXTERN void uiMainSteps(void); -_UI_EXTERN int uiMainStep(int wait); -_UI_EXTERN void uiQuit(void); - -_UI_EXTERN void uiQueueMain(void (*f)(void *data), void *data); - -_UI_EXTERN void uiOnShouldQuit(int (*f)(void *data), void *data); - -_UI_EXTERN void uiFreeText(char *text); - -typedef struct uiControl uiControl; - -struct uiControl { - uint32_t Signature; - uint32_t OSSignature; - uint32_t TypeSignature; - void (*Destroy)(uiControl *); - uintptr_t (*Handle)(uiControl *); - uiControl *(*Parent)(uiControl *); - void (*SetParent)(uiControl *, uiControl *); - int (*Toplevel)(uiControl *); - int (*Visible)(uiControl *); - void (*Show)(uiControl *); - void (*Hide)(uiControl *); - int (*Enabled)(uiControl *); - void (*Enable)(uiControl *); - void (*Disable)(uiControl *); - void (*SetFocus)(uiControl *); - void (*SetMinSize)(uiControl*, int, int); - - int MinWidth, MinHeight; - - void* UserData; -}; -// TOOD add argument names to all arguments -#define uiControl(this) ((uiControl *) (this)) -_UI_EXTERN void uiControlDestroy(uiControl *); -_UI_EXTERN uintptr_t uiControlHandle(uiControl *); -_UI_EXTERN uiControl *uiControlParent(uiControl *); -_UI_EXTERN void uiControlSetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlToplevel(uiControl *); -_UI_EXTERN int uiControlVisible(uiControl *); -_UI_EXTERN void uiControlShow(uiControl *); -_UI_EXTERN void uiControlHide(uiControl *); -_UI_EXTERN int uiControlEnabled(uiControl *); -_UI_EXTERN void uiControlEnable(uiControl *); -_UI_EXTERN void uiControlDisable(uiControl *); -_UI_EXTERN void uiControlSetFocus(uiControl *); -_UI_EXTERN void uiControlSetMinSize(uiControl *, int w, int h); // -1 = no minimum - -_UI_EXTERN uiControl *uiAllocControl(size_t n, uint32_t OSsig, uint32_t typesig, const char *typenamestr); -_UI_EXTERN void uiFreeControl(uiControl *); - -// TODO make sure all controls have these -_UI_EXTERN void uiControlVerifySetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlEnabledToUser(uiControl *); - -_UI_EXTERN void uiUserBugCannotSetParentOnToplevel(const char *type); - -typedef struct uiWindow uiWindow; -#define uiWindow(this) ((uiWindow *) (this)) -_UI_EXTERN char *uiWindowTitle(uiWindow *w); -_UI_EXTERN void uiWindowSetTitle(uiWindow *w, const char *title); -_UI_EXTERN void uiWindowPosition(uiWindow *w, int *x, int *y); -_UI_EXTERN void uiWindowSetPosition(uiWindow *w, int x, int y); -_UI_EXTERN void uiWindowContentSize(uiWindow *w, int *width, int *height); -_UI_EXTERN void uiWindowSetContentSize(uiWindow *w, int width, int height); -_UI_EXTERN int uiWindowMinimized(uiWindow *w); -_UI_EXTERN void uiWindowSetMinimized(uiWindow *w, int minimized); -_UI_EXTERN int uiWindowMaximized(uiWindow *w); -_UI_EXTERN void uiWindowSetMaximized(uiWindow *w, int maximized); -_UI_EXTERN int uiWindowFullscreen(uiWindow *w); -_UI_EXTERN void uiWindowSetFullscreen(uiWindow *w, int fullscreen); -_UI_EXTERN int uiWindowBorderless(uiWindow *w); -_UI_EXTERN void uiWindowSetBorderless(uiWindow *w, int borderless); -_UI_EXTERN void uiWindowSetChild(uiWindow *w, uiControl *child); -_UI_EXTERN int uiWindowMargined(uiWindow *w); -_UI_EXTERN void uiWindowSetMargined(uiWindow *w, int margined); -_UI_EXTERN void uiWindowSetDropTarget(uiWindow* w, int drop); -_UI_EXTERN uiWindow 
*uiNewWindow(const char *title, int width, int height, int maximized, int hasMenubar, int resizable); - -_UI_EXTERN void uiWindowOnContentSizeChanged(uiWindow *w, void (*f)(uiWindow *, void *), void *data); -_UI_EXTERN void uiWindowOnClosing(uiWindow *w, int (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnDropFile(uiWindow *w, void (*f)(uiWindow *w, char *file, void *data), void *data); -_UI_EXTERN void uiWindowOnGetFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnLoseFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); - -typedef struct uiButton uiButton; -#define uiButton(this) ((uiButton *) (this)) -_UI_EXTERN char *uiButtonText(uiButton *b); -_UI_EXTERN void uiButtonSetText(uiButton *b, const char *text); -_UI_EXTERN void uiButtonOnClicked(uiButton *b, void (*f)(uiButton *b, void *data), void *data); -_UI_EXTERN uiButton *uiNewButton(const char *text); - -typedef struct uiBox uiBox; -#define uiBox(this) ((uiBox *) (this)) -_UI_EXTERN void uiBoxAppend(uiBox *b, uiControl *child, int stretchy); -_UI_EXTERN void uiBoxDelete(uiBox *b, int index); -_UI_EXTERN int uiBoxPadded(uiBox *b); -_UI_EXTERN void uiBoxSetPadded(uiBox *b, int padded); -_UI_EXTERN uiBox *uiNewHorizontalBox(void); -_UI_EXTERN uiBox *uiNewVerticalBox(void); - -typedef struct uiCheckbox uiCheckbox; -#define uiCheckbox(this) ((uiCheckbox *) (this)) -_UI_EXTERN char *uiCheckboxText(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetText(uiCheckbox *c, const char *text); -_UI_EXTERN void uiCheckboxOnToggled(uiCheckbox *c, void (*f)(uiCheckbox *c, void *data), void *data); -_UI_EXTERN int uiCheckboxChecked(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetChecked(uiCheckbox *c, int checked); -_UI_EXTERN uiCheckbox *uiNewCheckbox(const char *text); - -typedef struct uiEntry uiEntry; -#define uiEntry(this) ((uiEntry *) (this)) -_UI_EXTERN char *uiEntryText(uiEntry *e); -_UI_EXTERN void uiEntrySetText(uiEntry *e, const char *text); -_UI_EXTERN void uiEntryOnChanged(uiEntry *e, void (*f)(uiEntry *e, void *data), void *data); -_UI_EXTERN int uiEntryReadOnly(uiEntry *e); -_UI_EXTERN void uiEntrySetReadOnly(uiEntry *e, int readonly); -_UI_EXTERN uiEntry *uiNewEntry(void); -_UI_EXTERN uiEntry *uiNewPasswordEntry(void); -_UI_EXTERN uiEntry *uiNewSearchEntry(void); - -typedef struct uiLabel uiLabel; -#define uiLabel(this) ((uiLabel *) (this)) -_UI_EXTERN char *uiLabelText(uiLabel *l); -_UI_EXTERN void uiLabelSetText(uiLabel *l, const char *text); -_UI_EXTERN uiLabel *uiNewLabel(const char *text); - -typedef struct uiTab uiTab; -#define uiTab(this) ((uiTab *) (this)) -_UI_EXTERN void uiTabAppend(uiTab *t, const char *name, uiControl *c); -_UI_EXTERN void uiTabInsertAt(uiTab *t, const char *name, int before, uiControl *c); -_UI_EXTERN void uiTabDelete(uiTab *t, int index); -_UI_EXTERN int uiTabNumPages(uiTab *t); -_UI_EXTERN int uiTabMargined(uiTab *t, int page); -_UI_EXTERN void uiTabSetMargined(uiTab *t, int page, int margined); -_UI_EXTERN uiTab *uiNewTab(void); - -typedef struct uiGroup uiGroup; -#define uiGroup(this) ((uiGroup *) (this)) -_UI_EXTERN char *uiGroupTitle(uiGroup *g); -_UI_EXTERN void uiGroupSetTitle(uiGroup *g, const char *title); -_UI_EXTERN void uiGroupSetChild(uiGroup *g, uiControl *c); -_UI_EXTERN int uiGroupMargined(uiGroup *g); -_UI_EXTERN void uiGroupSetMargined(uiGroup *g, int margined); -_UI_EXTERN uiGroup *uiNewGroup(const char *title); - -// spinbox/slider rules: -// setting value outside of range will automatically clamp -// initial 
value is minimum -// complaint if min >= max? - -typedef struct uiSpinbox uiSpinbox; -#define uiSpinbox(this) ((uiSpinbox *) (this)) -_UI_EXTERN int uiSpinboxValue(uiSpinbox *s); -_UI_EXTERN void uiSpinboxSetValue(uiSpinbox *s, int value); -_UI_EXTERN void uiSpinboxOnChanged(uiSpinbox *s, void (*f)(uiSpinbox *s, void *data), void *data); -_UI_EXTERN uiSpinbox *uiNewSpinbox(int min, int max); - -typedef struct uiSlider uiSlider; -#define uiSlider(this) ((uiSlider *) (this)) -_UI_EXTERN int uiSliderValue(uiSlider *s); -_UI_EXTERN void uiSliderSetValue(uiSlider *s, int value); -_UI_EXTERN void uiSliderOnChanged(uiSlider *s, void (*f)(uiSlider *s, void *data), void *data); -_UI_EXTERN uiSlider *uiNewSlider(int min, int max); - -typedef struct uiProgressBar uiProgressBar; -#define uiProgressBar(this) ((uiProgressBar *) (this)) -_UI_EXTERN int uiProgressBarValue(uiProgressBar *p); -_UI_EXTERN void uiProgressBarSetValue(uiProgressBar *p, int n); -_UI_EXTERN uiProgressBar *uiNewProgressBar(void); - -typedef struct uiSeparator uiSeparator; -#define uiSeparator(this) ((uiSeparator *) (this)) -_UI_EXTERN uiSeparator *uiNewHorizontalSeparator(void); -_UI_EXTERN uiSeparator *uiNewVerticalSeparator(void); - -typedef struct uiCombobox uiCombobox; -#define uiCombobox(this) ((uiCombobox *) (this)) -_UI_EXTERN void uiComboboxAppend(uiCombobox *c, const char *text); -_UI_EXTERN int uiComboboxSelected(uiCombobox *c); -_UI_EXTERN void uiComboboxSetSelected(uiCombobox *c, int n); -_UI_EXTERN void uiComboboxOnSelected(uiCombobox *c, void (*f)(uiCombobox *c, void *data), void *data); -_UI_EXTERN uiCombobox *uiNewCombobox(void); - -typedef struct uiEditableCombobox uiEditableCombobox; -#define uiEditableCombobox(this) ((uiEditableCombobox *) (this)) -_UI_EXTERN void uiEditableComboboxAppend(uiEditableCombobox *c, const char *text); -_UI_EXTERN char *uiEditableComboboxText(uiEditableCombobox *c); -_UI_EXTERN void uiEditableComboboxSetText(uiEditableCombobox *c, const char *text); -// TODO what do we call a function that sets the currently selected item and fills the text field with it? editable comboboxes have no consistent concept of selected item -_UI_EXTERN void uiEditableComboboxOnChanged(uiEditableCombobox *c, void (*f)(uiEditableCombobox *c, void *data), void *data); -_UI_EXTERN uiEditableCombobox *uiNewEditableCombobox(void); - -typedef struct uiRadioButtons uiRadioButtons; -#define uiRadioButtons(this) ((uiRadioButtons *) (this)) -_UI_EXTERN void uiRadioButtonsAppend(uiRadioButtons *r, const char *text); -_UI_EXTERN int uiRadioButtonsSelected(uiRadioButtons *r); -_UI_EXTERN void uiRadioButtonsSetSelected(uiRadioButtons *r, int n); -_UI_EXTERN void uiRadioButtonsOnSelected(uiRadioButtons *r, void (*f)(uiRadioButtons *, void *), void *data); -_UI_EXTERN uiRadioButtons *uiNewRadioButtons(void); - -typedef struct uiDateTimePicker uiDateTimePicker; -#define uiDateTimePicker(this) ((uiDateTimePicker *) (this)) -_UI_EXTERN uiDateTimePicker *uiNewDateTimePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewDatePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewTimePicker(void); - -// TODO provide a facility for entering tab stops? 
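
As the spinbox/slider rules above say, out-of-range values clamp automatically and the initial value is the minimum. A small sketch against the declarations just listed (assumes uiInit has already run; the callback body is illustrative):

    #include <stdio.h>
    #include "libui/ui.h"

    static void OnBlockSizeChanged(uiSpinbox* s, void* blarg)
    {
        printf("max block size: %d\n", uiSpinboxValue(s));
    }

    static void AddBlockSizeSpinbox(uiBox* parent)
    {
        uiSpinbox* s = uiNewSpinbox(1, 32);  // initial value is the minimum: 1
        uiSpinboxSetValue(s, 100);           // outside the range: clamps to 32
        uiSpinboxOnChanged(s, OnBlockSizeChanged, NULL);
        uiBoxAppend(parent, uiControl(s), 0);
    }
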
-typedef struct uiMultilineEntry uiMultilineEntry; -#define uiMultilineEntry(this) ((uiMultilineEntry *) (this)) -_UI_EXTERN char *uiMultilineEntryText(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetText(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryAppend(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryOnChanged(uiMultilineEntry *e, void (*f)(uiMultilineEntry *e, void *data), void *data); -_UI_EXTERN int uiMultilineEntryReadOnly(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetReadOnly(uiMultilineEntry *e, int readonly); -_UI_EXTERN uiMultilineEntry *uiNewMultilineEntry(void); -_UI_EXTERN uiMultilineEntry *uiNewNonWrappingMultilineEntry(void); - -typedef struct uiMenuItem uiMenuItem; -#define uiMenuItem(this) ((uiMenuItem *) (this)) -_UI_EXTERN void uiMenuItemEnable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemDisable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemOnClicked(uiMenuItem *m, void (*f)(uiMenuItem *sender, uiWindow *window, void *data), void *data); -_UI_EXTERN int uiMenuItemChecked(uiMenuItem *m); -_UI_EXTERN void uiMenuItemSetChecked(uiMenuItem *m, int checked); - -typedef struct uiMenu uiMenu; -#define uiMenu(this) ((uiMenu *) (this)) -_UI_EXTERN uiMenuItem *uiMenuAppendItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendCheckItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendQuitItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendPreferencesItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendAboutItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendSubmenu(uiMenu *m, uiMenu* child); -_UI_EXTERN void uiMenuAppendSeparator(uiMenu *m); -_UI_EXTERN uiMenu *uiNewMenu(const char *name); - -_UI_EXTERN char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN void uiMsgBox(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN void uiMsgBoxError(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description); - -typedef struct uiArea uiArea; -typedef struct uiAreaHandler uiAreaHandler; -typedef struct uiAreaDrawParams uiAreaDrawParams; -typedef struct uiAreaMouseEvent uiAreaMouseEvent; -typedef struct uiAreaKeyEvent uiAreaKeyEvent; - -typedef struct uiDrawContext uiDrawContext; - -// TO CONSIDER: the uiAreaHandler param there seems useless -// (might use individual callbacks instead of handler struct?) -struct uiAreaHandler { - void (*Draw)(uiAreaHandler *, uiArea *, uiAreaDrawParams *); - // TODO document that resizes cause a full redraw for non-scrolling areas; implementation-defined for scrolling areas - void (*MouseEvent)(uiAreaHandler *, uiArea *, uiAreaMouseEvent *); - // TODO document that on first show if the mouse is already in the uiArea then one gets sent with left=0 - // TODO what about when the area is hidden and then shown again? - void (*MouseCrossed)(uiAreaHandler *, uiArea *, int left); - void (*DragBroken)(uiAreaHandler *, uiArea *); - int (*KeyEvent)(uiAreaHandler *, uiArea *, uiAreaKeyEvent *); - void (*Resize)(uiAreaHandler *, uiArea *, int, int); -}; - -// TODO RTL layouts? 
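
The uiAreaHandler struct above is a table of plain function pointers, filled in member order. A do-nothing handler as a sketch (melonDS's real one, MainDrawAreaHandler, appears in the deleted main.cpp further below); the int return of KeyEvent is taken here to mean handled/not handled:

    #include "libui/ui.h"

    static void OnDraw(uiAreaHandler* h, uiArea* a, uiAreaDrawParams* p) { /* repaint */ }
    static void OnMouseEvent(uiAreaHandler* h, uiArea* a, uiAreaMouseEvent* e) {}
    static void OnMouseCrossed(uiAreaHandler* h, uiArea* a, int left) {}
    static void OnDragBroken(uiAreaHandler* h, uiArea* a) {}
    static int OnKeyEvent(uiAreaHandler* h, uiArea* a, uiAreaKeyEvent* e) { return 0; }
    static void OnResize(uiAreaHandler* h, uiArea* a, int w, int ht) {}

    // member order matches the declaration: Draw, MouseEvent, MouseCrossed,
    // DragBroken, KeyEvent, Resize
    static uiAreaHandler AreaHandler =
        { OnDraw, OnMouseEvent, OnMouseCrossed, OnDragBroken, OnKeyEvent, OnResize };

    uiArea* MakeArea() { return uiNewArea(&AreaHandler); }
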
-// TODO reconcile edge and corner naming -_UI_ENUM(uiWindowResizeEdge) { - uiWindowResizeEdgeLeft, - uiWindowResizeEdgeTop, - uiWindowResizeEdgeRight, - uiWindowResizeEdgeBottom, - uiWindowResizeEdgeTopLeft, - uiWindowResizeEdgeTopRight, - uiWindowResizeEdgeBottomLeft, - uiWindowResizeEdgeBottomRight, - // TODO have one for keyboard resizes? - // TODO GDK doesn't seem to have any others, including for keyboards... - // TODO way to bring up the system menu instead? -}; - -#define uiGLVersion(major, minor) ((major) | ((minor)<<16)) -#define uiGLVerMajor(ver) ((ver) & 0xFFFF) -#define uiGLVerMinor(ver) ((ver) >> 16) - -#define uiArea(this) ((uiArea *) (this)) -// TODO give a better name -// TODO document the types of width and height -_UI_EXTERN void uiAreaSetSize(uiArea *a, int width, int height); -// TODO uiAreaQueueRedraw() -_UI_EXTERN void uiAreaQueueRedrawAll(uiArea *a); -_UI_EXTERN void uiAreaScrollTo(uiArea *a, double x, double y, double width, double height); -// TODO document these can only be called within Mouse() handlers -// TODO should these be allowed on scrolling areas? -// TODO decide which mouse events should be accepted; Down is the only one guaranteed to work right now -// TODO what happens to events after calling this up to and including the next mouse up? -// TODO release capture? -_UI_EXTERN void uiAreaBeginUserWindowMove(uiArea *a); -_UI_EXTERN void uiAreaBeginUserWindowResize(uiArea *a, uiWindowResizeEdge edge); -_UI_EXTERN void uiAreaSetBackgroundColor(uiArea *a, int r, int g, int b); -_UI_EXTERN uiArea *uiNewArea(uiAreaHandler *ah); -_UI_EXTERN uiArea *uiNewGLArea(uiAreaHandler *ah, const unsigned int* req_versions); -_UI_EXTERN uiArea *uiNewScrollingArea(uiAreaHandler *ah, int width, int height); - -struct uiAreaDrawParams { - uiDrawContext *Context; - - // TODO document that this is only defined for nonscrolling areas - double AreaWidth; - double AreaHeight; - - double ClipX; - double ClipY; - double ClipWidth; - double ClipHeight; -}; - -typedef struct uiDrawPath uiDrawPath; -typedef struct uiDrawBrush uiDrawBrush; -typedef struct uiDrawStrokeParams uiDrawStrokeParams; -typedef struct uiDrawMatrix uiDrawMatrix; - -typedef struct uiDrawBrushGradientStop uiDrawBrushGradientStop; - -typedef struct uiDrawBitmap uiDrawBitmap; - -_UI_ENUM(uiDrawBrushType) { - uiDrawBrushTypeSolid, - uiDrawBrushTypeLinearGradient, - uiDrawBrushTypeRadialGradient, - uiDrawBrushTypeImage, -}; - -_UI_ENUM(uiDrawLineCap) { - uiDrawLineCapFlat, - uiDrawLineCapRound, - uiDrawLineCapSquare, -}; - -_UI_ENUM(uiDrawLineJoin) { - uiDrawLineJoinMiter, - uiDrawLineJoinRound, - uiDrawLineJoinBevel, -}; - -// this is the default for botoh cairo and Direct2D (in the latter case, from the C++ helper functions) -// Core Graphics doesn't explicitly specify a default, but NSBezierPath allows you to choose one, and this is the initial value -// so we're good to use it too! 
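
The uiGLVersion macros above pack a major/minor pair into one integer: low 16 bits major, high 16 bits minor. A quick round-trip check (macros copied from the header so the snippet is self-contained):

    #include <cassert>

    #define uiGLVersion(major, minor) ((major) | ((minor)<<16))
    #define uiGLVerMajor(ver) ((ver) & 0xFFFF)
    #define uiGLVerMinor(ver) ((ver) >> 16)

    int main()
    {
        unsigned int ver = uiGLVersion(3, 2);  // 3 | (2 << 16) == 0x00020003
        assert(uiGLVerMajor(ver) == 3);
        assert(uiGLVerMinor(ver) == 2);
        // the frontend's kGLVersions below, {uiGLVersion(3,2), uiGLVersion(3,1), 0},
        // is a zero-terminated preference list handed to uiNewGLArea()
        return 0;
    }
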
-#define uiDrawDefaultMiterLimit 10.0 - -_UI_ENUM(uiDrawFillMode) { - uiDrawFillModeWinding, - uiDrawFillModeAlternate, -}; - -struct uiDrawMatrix { - double M11; - double M12; - double M21; - double M22; - double M31; - double M32; -}; - -struct uiDrawBrush { - uiDrawBrushType Type; - - // solid brushes - double R; - double G; - double B; - double A; - - // gradient brushes - double X0; // linear: start X, radial: start X - double Y0; // linear: start Y, radial: start Y - double X1; // linear: end X, radial: outer circle center X - double Y1; // linear: end Y, radial: outer circle center Y - double OuterRadius; // radial gradients only - uiDrawBrushGradientStop *Stops; - size_t NumStops; - // TODO extend mode - // cairo: none, repeat, reflect, pad; no individual control - // Direct2D: repeat, reflect, pad; no individual control - // Core Graphics: none, pad; before and after individually - // TODO cairo documentation is inconsistent about pad - - // TODO images - - // TODO transforms -}; - -struct uiDrawBrushGradientStop { - double Pos; - double R; - double G; - double B; - double A; -}; - -struct uiDrawStrokeParams { - uiDrawLineCap Cap; - uiDrawLineJoin Join; - // TODO what if this is 0? on windows there will be a crash with dashing - double Thickness; - double MiterLimit; - double *Dashes; - // TOOD what if this is 1 on Direct2D? - // TODO what if a dash is 0 on Cairo or Quartz? - size_t NumDashes; - double DashPhase; -}; - -struct uiRect { - int X; - int Y; - int Width; - int Height; -}; - -typedef struct uiRect uiRect; - -_UI_EXTERN uiDrawPath *uiDrawNewPath(uiDrawFillMode fillMode); -_UI_EXTERN void uiDrawFreePath(uiDrawPath *p); - -_UI_EXTERN void uiDrawPathNewFigure(uiDrawPath *p, double x, double y); -_UI_EXTERN void uiDrawPathNewFigureWithArc(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathLineTo(uiDrawPath *p, double x, double y); -// notes: angles are both relative to 0 and go counterclockwise -// TODO is the initial line segment on cairo and OS X a proper join? -// TODO what if sweep < 0? 
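
A sketch of the path lifecycle declared above, as it would run inside an area's Draw callback: build figures, end the path, then fill or stroke it through the draw context (assumes the end-before-use rule that the separate uiDrawPathEnd step implies):

    #include "libui/ui.h"

    static void DrawTriangle(uiAreaDrawParams* p)
    {
        uiDrawPath* path = uiDrawNewPath(uiDrawFillModeWinding);
        uiDrawPathNewFigure(path, 10.0, 10.0);
        uiDrawPathLineTo(path, 110.0, 10.0);
        uiDrawPathLineTo(path, 60.0, 90.0);
        uiDrawPathCloseFigure(path);
        uiDrawPathEnd(path);               // finish building before drawing

        uiDrawBrush brush = {};
        brush.Type = uiDrawBrushTypeSolid; // a solid fill reads only R/G/B/A
        brush.R = 1.0; brush.A = 1.0;
        uiDrawFill(p->Context, path, &brush);
        uiDrawFreePath(path);
    }
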
-_UI_EXTERN void uiDrawPathArcTo(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathBezierTo(uiDrawPath *p, double c1x, double c1y, double c2x, double c2y, double endX, double endY); -// TODO quadratic bezier -_UI_EXTERN void uiDrawPathCloseFigure(uiDrawPath *p); - -// TODO effect of these when a figure is already started -_UI_EXTERN void uiDrawPathAddRectangle(uiDrawPath *p, double x, double y, double width, double height); - -_UI_EXTERN void uiDrawPathEnd(uiDrawPath *p); - -_UI_EXTERN void uiDrawStroke(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b, uiDrawStrokeParams *p); -_UI_EXTERN void uiDrawFill(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b); - -// TODO primitives: -// - rounded rectangles -// - elliptical arcs -// - quadratic bezier curves - -_UI_EXTERN void uiDrawMatrixSetIdentity(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTranslate(uiDrawMatrix *m, double x, double y); -_UI_EXTERN void uiDrawMatrixScale(uiDrawMatrix *m, double xCenter, double yCenter, double x, double y); -_UI_EXTERN void uiDrawMatrixRotate(uiDrawMatrix *m, double x, double y, double amount); -_UI_EXTERN void uiDrawMatrixSkew(uiDrawMatrix *m, double x, double y, double xamount, double yamount); -_UI_EXTERN void uiDrawMatrixMultiply(uiDrawMatrix *dest, uiDrawMatrix *src); -_UI_EXTERN int uiDrawMatrixInvertible(uiDrawMatrix *m); -_UI_EXTERN int uiDrawMatrixInvert(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTransformPoint(uiDrawMatrix *m, double *x, double *y); -_UI_EXTERN void uiDrawMatrixTransformSize(uiDrawMatrix *m, double *x, double *y); - -_UI_EXTERN void uiDrawTransform(uiDrawContext *c, uiDrawMatrix *m); - -// TODO add a uiDrawPathStrokeToFill() or something like that -_UI_EXTERN void uiDrawClip(uiDrawContext *c, uiDrawPath *path); - -_UI_EXTERN void uiDrawSave(uiDrawContext *c); -_UI_EXTERN void uiDrawRestore(uiDrawContext *c); - -// bitmap API -_UI_EXTERN uiDrawBitmap* uiDrawNewBitmap(uiDrawContext* c, int width, int height, int alpha); -_UI_EXTERN void uiDrawBitmapUpdate(uiDrawBitmap* bmp, const void* data); -_UI_EXTERN void uiDrawBitmapDraw(uiDrawContext* c, uiDrawBitmap* bmp, uiRect* srcrect, uiRect* dstrect, int filter); -_UI_EXTERN void uiDrawFreeBitmap(uiDrawBitmap* bmp); - -// TODO manage the use of Text, Font, and TextFont, and of the uiDrawText prefix in general - -///// TODO reconsider this -typedef struct uiDrawFontFamilies uiDrawFontFamilies; - -_UI_EXTERN uiDrawFontFamilies *uiDrawListFontFamilies(void); -_UI_EXTERN int uiDrawFontFamiliesNumFamilies(uiDrawFontFamilies *ff); -_UI_EXTERN char *uiDrawFontFamiliesFamily(uiDrawFontFamilies *ff, int n); -_UI_EXTERN void uiDrawFreeFontFamilies(uiDrawFontFamilies *ff); -///// END TODO - -typedef struct uiDrawTextLayout uiDrawTextLayout; -typedef struct uiDrawTextFont uiDrawTextFont; -typedef struct uiDrawTextFontDescriptor uiDrawTextFontDescriptor; -typedef struct uiDrawTextFontMetrics uiDrawTextFontMetrics; - -_UI_ENUM(uiDrawTextWeight) { - uiDrawTextWeightThin, - uiDrawTextWeightUltraLight, - uiDrawTextWeightLight, - uiDrawTextWeightBook, - uiDrawTextWeightNormal, - uiDrawTextWeightMedium, - uiDrawTextWeightSemiBold, - uiDrawTextWeightBold, - uiDrawTextWeightUltraBold, - uiDrawTextWeightHeavy, - uiDrawTextWeightUltraHeavy, -}; - -_UI_ENUM(uiDrawTextItalic) { - uiDrawTextItalicNormal, - uiDrawTextItalicOblique, - uiDrawTextItalicItalic, -}; - -_UI_ENUM(uiDrawTextStretch) { - uiDrawTextStretchUltraCondensed, - uiDrawTextStretchExtraCondensed, - 
uiDrawTextStretchCondensed, - uiDrawTextStretchSemiCondensed, - uiDrawTextStretchNormal, - uiDrawTextStretchSemiExpanded, - uiDrawTextStretchExpanded, - uiDrawTextStretchExtraExpanded, - uiDrawTextStretchUltraExpanded, -}; - -struct uiDrawTextFontDescriptor { - const char *Family; - double Size; - uiDrawTextWeight Weight; - uiDrawTextItalic Italic; - uiDrawTextStretch Stretch; -}; - -struct uiDrawTextFontMetrics { - double Ascent; - double Descent; - double Leading; - // TODO do these two mean the same across all platforms? - double UnderlinePos; - double UnderlineThickness; -}; - -_UI_EXTERN uiDrawTextFont *uiDrawLoadClosestFont(const uiDrawTextFontDescriptor *desc); -_UI_EXTERN void uiDrawFreeTextFont(uiDrawTextFont *font); -_UI_EXTERN uintptr_t uiDrawTextFontHandle(uiDrawTextFont *font); -_UI_EXTERN void uiDrawTextFontDescribe(uiDrawTextFont *font, uiDrawTextFontDescriptor *desc); -// TODO make copy with given attributes methods? -// TODO yuck this name -_UI_EXTERN void uiDrawTextFontGetMetrics(uiDrawTextFont *font, uiDrawTextFontMetrics *metrics); - -// TODO initial line spacing? and what about leading? -_UI_EXTERN uiDrawTextLayout *uiDrawNewTextLayout(const char *text, uiDrawTextFont *defaultFont, double width); -_UI_EXTERN void uiDrawFreeTextLayout(uiDrawTextLayout *layout); -// TODO get width -_UI_EXTERN void uiDrawTextLayoutSetWidth(uiDrawTextLayout *layout, double width); -_UI_EXTERN void uiDrawTextLayoutExtents(uiDrawTextLayout *layout, double *width, double *height); - -// and the attributes that you can set on a text layout -_UI_EXTERN void uiDrawTextLayoutSetColor(uiDrawTextLayout *layout, int startChar, int endChar, double r, double g, double b, double a); - -_UI_EXTERN void uiDrawText(uiDrawContext *c, double x, double y, uiDrawTextLayout *layout); - - -// OpenGL support - -typedef struct uiGLContext uiGLContext; - -_UI_EXTERN uiGLContext *uiAreaGetGLContext(uiArea* a); -_UI_EXTERN void uiGLMakeContextCurrent(uiGLContext* ctx); -_UI_EXTERN void uiGLBegin(uiGLContext* ctx); -_UI_EXTERN void uiGLEnd(uiGLContext* ctx); -_UI_EXTERN unsigned int uiGLGetVersion(uiGLContext* ctx); -_UI_EXTERN void *uiGLGetProcAddress(const char* proc); -_UI_EXTERN int uiGLGetFramebuffer(uiGLContext* ctx); -_UI_EXTERN float uiGLGetFramebufferScale(uiGLContext* ctx); -_UI_EXTERN void uiGLSwapBuffers(uiGLContext* ctx); -_UI_EXTERN void uiGLSetVSync(int sync); - - -_UI_ENUM(uiModifiers) { - uiModifierCtrl = 1 << 0, - uiModifierAlt = 1 << 1, - uiModifierShift = 1 << 2, - uiModifierSuper = 1 << 3, -}; - -// TODO document drag captures -struct uiAreaMouseEvent { - // TODO document what these mean for scrolling areas - double X; - double Y; - - // TODO see draw above - double AreaWidth; - double AreaHeight; - - int Down; - int Up; - - int Count; - - uiModifiers Modifiers; - - uint64_t Held1To64; -}; - -_UI_ENUM(uiExtKey) { - uiExtKeyEscape = 1, - uiExtKeyInsert, // equivalent to "Help" on Apple keyboards - uiExtKeyDelete, - uiExtKeyHome, - uiExtKeyEnd, - uiExtKeyPageUp, - uiExtKeyPageDown, - uiExtKeyUp, - uiExtKeyDown, - uiExtKeyLeft, - uiExtKeyRight, - uiExtKeyF1, // F1..F12 are guaranteed to be consecutive - uiExtKeyF2, - uiExtKeyF3, - uiExtKeyF4, - uiExtKeyF5, - uiExtKeyF6, - uiExtKeyF7, - uiExtKeyF8, - uiExtKeyF9, - uiExtKeyF10, - uiExtKeyF11, - uiExtKeyF12, - uiExtKeyN0, // numpad keys; independent of Num Lock state - uiExtKeyN1, // N0..N9 are guaranteed to be consecutive - uiExtKeyN2, - uiExtKeyN3, - uiExtKeyN4, - uiExtKeyN5, - uiExtKeyN6, - uiExtKeyN7, - uiExtKeyN8, - uiExtKeyN9, - uiExtKeyNDot, - 
uiExtKeyNEnter, - uiExtKeyNAdd, - uiExtKeyNSubtract, - uiExtKeyNMultiply, - uiExtKeyNDivide, -}; - -struct uiAreaKeyEvent { - char Key; - uiExtKey ExtKey; - uiModifiers Modifier; - - uiModifiers Modifiers; - - // additional things - int Scancode; // bit0-7: scancode, bit8: ext flag - - int Up; - int Repeat; -}; - -typedef struct uiFontButton uiFontButton; -#define uiFontButton(this) ((uiFontButton *) (this)) -// TODO document this returns a new font -_UI_EXTERN uiDrawTextFont *uiFontButtonFont(uiFontButton *b); -// TOOD SetFont, mechanics -_UI_EXTERN void uiFontButtonOnChanged(uiFontButton *b, void (*f)(uiFontButton *, void *), void *data); -_UI_EXTERN uiFontButton *uiNewFontButton(void); - -typedef struct uiColorButton uiColorButton; -#define uiColorButton(this) ((uiColorButton *) (this)) -_UI_EXTERN void uiColorButtonColor(uiColorButton *b, double *r, double *g, double *bl, double *a); -_UI_EXTERN void uiColorButtonSetColor(uiColorButton *b, double r, double g, double bl, double a); -_UI_EXTERN void uiColorButtonOnChanged(uiColorButton *b, void (*f)(uiColorButton *, void *), void *data); -_UI_EXTERN uiColorButton *uiNewColorButton(void); - -typedef struct uiForm uiForm; -#define uiForm(this) ((uiForm *) (this)) -_UI_EXTERN void uiFormAppend(uiForm *f, const char *label, uiControl *c, int stretchy); -_UI_EXTERN void uiFormDelete(uiForm *f, int index); -_UI_EXTERN int uiFormPadded(uiForm *f); -_UI_EXTERN void uiFormSetPadded(uiForm *f, int padded); -_UI_EXTERN uiForm *uiNewForm(void); - -_UI_ENUM(uiAlign) { - uiAlignFill, - uiAlignStart, - uiAlignCenter, - uiAlignEnd, -}; - -_UI_ENUM(uiAt) { - uiAtLeading, - uiAtTop, - uiAtTrailing, - uiAtBottom, -}; - -typedef struct uiGrid uiGrid; -#define uiGrid(this) ((uiGrid *) (this)) -_UI_EXTERN void uiGridAppend(uiGrid *g, uiControl *c, int left, int top, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN void uiGridInsertAt(uiGrid *g, uiControl *c, uiControl *existing, uiAt at, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN int uiGridPadded(uiGrid *g); -_UI_EXTERN void uiGridSetPadded(uiGrid *g, int padded); -_UI_EXTERN uiGrid *uiNewGrid(void); - - -// misc. 
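
uiGridAppend's long parameter list above is easier to read with the slots named: (control, left, top, xspan, yspan, hexpand, halign, vexpand, valign). A sketch placing a label next to a spinbox on one row (the layout values are illustrative):

    #include "libui/ui.h"

    static uiGrid* MakeBlockSizeRow()
    {
        uiGrid* g = uiNewGrid();
        uiGridSetPadded(g, 1);
        uiGridAppend(g, uiControl(uiNewLabel("Maximum block size:")),
                     0, 0, 1, 1, 0, uiAlignFill, 0, uiAlignFill);  // column 0, no expand
        uiGridAppend(g, uiControl(uiNewSpinbox(1, 32)),
                     1, 0, 1, 1, 1, uiAlignFill, 0, uiAlignFill);  // column 1, expands
        return g;
    }
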
- -_UI_EXTERN char* uiKeyName(int scancode); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/libui_sdl/libui/unix/stddialogs.c b/src/libui_sdl/libui/unix/stddialogs.c deleted file mode 100644 index 10c598d..0000000 --- a/src/libui_sdl/libui/unix/stddialogs.c +++ /dev/null @@ -1,126 +0,0 @@ -// 26 june 2015 -#include "uipriv_unix.h" - -// LONGTERM figure out why, and describe, that this is the desired behavior -// LONGTERM also point out that font and color buttons also work like this - -#define windowWindow(w) ((w)?(GTK_WINDOW(uiControlHandle(uiControl(w)))):NULL) - -static char *filedialog(GtkWindow *parent, GtkFileChooserAction mode, const gchar *confirm, const char* filter, const char* initpath) -{ - GtkWidget *fcd; - GtkFileChooser *fc; - gint response; - char *filename; - - fcd = gtk_file_chooser_dialog_new(NULL, parent, mode, - "_Cancel", GTK_RESPONSE_CANCEL, - confirm, GTK_RESPONSE_ACCEPT, - NULL); - fc = GTK_FILE_CHOOSER(fcd); - - // filters - { - gchar _filter[256]; - gchar* fp = &_filter[0]; int s = 0; - gchar* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - GtkFileFilter* filter = gtk_file_filter_new(); - gtk_file_filter_set_name(filter, fname); - - for (gchar* j = fp; ; j++) - { - if (*j == ';') - { - *j = '\0'; - gtk_file_filter_add_pattern(filter, fp); - fp = j+1; - } - else if (*j == '\0') - { - gtk_file_filter_add_pattern(filter, fp); - break; - } - } - - gtk_file_chooser_add_filter(fc, filter); - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - } - - gtk_file_chooser_set_local_only(fc, FALSE); - gtk_file_chooser_set_select_multiple(fc, FALSE); - gtk_file_chooser_set_show_hidden(fc, TRUE); - gtk_file_chooser_set_do_overwrite_confirmation(fc, TRUE); - gtk_file_chooser_set_create_folders(fc, TRUE); - if (initpath && strlen(initpath)>0) - gtk_file_chooser_set_current_folder(fc, initpath); - - response = gtk_dialog_run(GTK_DIALOG(fcd)); - if (response != GTK_RESPONSE_ACCEPT) { - gtk_widget_destroy(fcd); - return NULL; - } - filename = uiUnixStrdupText(gtk_file_chooser_get_filename(fc)); - gtk_widget_destroy(fcd); - return filename; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_OPEN, "_Open", filter, initpath); -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_SAVE, "_Save", filter, initpath); -} - -static int msgbox(GtkWindow *parent, const char *title, const char *description, GtkMessageType type, GtkButtonsType buttons) -{ - GtkWidget *md; - - md = gtk_message_dialog_new(parent, GTK_DIALOG_MODAL, - type, buttons, - "%s", title); - gtk_message_dialog_format_secondary_text(GTK_MESSAGE_DIALOG(md), "%s", description); - int result = gtk_dialog_run(GTK_DIALOG(md)); - gtk_widget_destroy(md); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_OTHER, GTK_BUTTONS_OK); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_ERROR, GTK_BUTTONS_OK); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - int result = - 
msgbox(windowWindow(parent), title, description, GTK_MESSAGE_QUESTION, GTK_BUTTONS_OK_CANCEL); - - return result == GTK_RESPONSE_OK; -} \ No newline at end of file diff --git a/src/libui_sdl/libui/windows/stddialogs.cpp b/src/libui_sdl/libui/windows/stddialogs.cpp deleted file mode 100644 index 7537015..0000000 --- a/src/libui_sdl/libui/windows/stddialogs.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// 22 may 2015 -#include "uipriv_windows.hpp" - -// TODO document all this is what we want -// TODO do the same for font and color buttons - -// notes: -// - FOS_SUPPORTSTREAMABLEITEMS doesn't seem to be supported on windows vista, or at least not with the flags we use -// - even with FOS_NOVALIDATE the dialogs will reject invalid filenames (at least on Vista, anyway) -// - lack of FOS_NOREADONLYRETURN doesn't seem to matter on Windows 7 - -// TODO -// - http://blogs.msdn.com/b/wpfsdk/archive/2006/10/26/uncommon-dialogs--font-chooser-and-color-picker-dialogs.aspx -// - when a dialog is active, tab navigation in other windows stops working -// - when adding uiOpenFolder(), use IFileDialog as well - https://msdn.microsoft.com/en-us/library/windows/desktop/bb762115%28v=vs.85%29.aspx - -#define windowHWND(w) (w ? (HWND) uiControlHandle(uiControl(w)) : NULL) - -char *commonItemDialog(HWND parent, REFCLSID clsid, REFIID iid, const char* filter, const char* initpath, FILEOPENDIALOGOPTIONS optsadd) -{ - IFileDialog *d = NULL; - FILEOPENDIALOGOPTIONS opts; - IShellItem *result = NULL; - WCHAR *wname = NULL; - char *name = NULL; - HRESULT hr; - - hr = CoCreateInstance(clsid, - NULL, CLSCTX_INPROC_SERVER, - iid, (LPVOID *) (&d)); - if (hr != S_OK) { - logHRESULT(L"error creating common item dialog", hr); - // always return NULL on error - goto out; - } - hr = d->GetOptions(&opts); - if (hr != S_OK) { - logHRESULT(L"error getting current options", hr); - goto out; - } - opts |= optsadd; - // the other platforms don't check read-only; we won't either - opts &= ~FOS_NOREADONLYRETURN; - hr = d->SetOptions(opts); - if (hr != S_OK) { - logHRESULT(L"error setting options", hr); - goto out; - } - - // filters - { - COMDLG_FILTERSPEC filterspec[8]; - wchar_t _filter[256]; - wchar_t* fp = &_filter[0]; int s = 0; - wchar_t* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - filterspec[s>>1].pszName = fname; - filterspec[s>>1].pszSpec = fp; - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - d->SetFileTypes(s>>1, filterspec); - } - - hr = d->Show(parent); - if (hr == HRESULT_FROM_WIN32(ERROR_CANCELLED)) - // cancelled; return NULL like we have ready - goto out; - if (hr != S_OK) { - logHRESULT(L"error showing dialog", hr); - goto out; - } - hr = d->GetResult(&result); - if (hr != S_OK) { - logHRESULT(L"error getting dialog result", hr); - goto out; - } - hr = result->GetDisplayName(SIGDN_FILESYSPATH, &wname); - if (hr != S_OK) { - logHRESULT(L"error getting filename", hr); - goto out; - } - name = toUTF8(wname); - -out: - if (wname != NULL) - CoTaskMemFree(wname); - if (result != NULL) - result->Release(); - if (d != NULL) - d->Release(); - return name; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileOpenDialog, IID_IFileOpenDialog, - filter, initpath, - FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | 
FOS_NOVALIDATE | FOS_PATHMUSTEXIST | FOS_FILEMUSTEXIST | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileSaveDialog, IID_IFileSaveDialog, - filter, initpath, - FOS_OVERWRITEPROMPT | FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | FOS_NOVALIDATE | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -// TODO switch to TaskDialogIndirect()? - -static int msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) -{ - WCHAR *wtitle, *wdescription; - HRESULT hr; - - wtitle = toUTF16(title); - wdescription = toUTF16(description); - - int result; - hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, &result); - if (hr != S_OK) - logHRESULT(L"error showing task dialog", hr); - - uiFree(wdescription); - uiFree(wtitle); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, NULL); - enableAllWindowsExcept(parent); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, TD_ERROR_ICON); - enableAllWindowsExcept(parent); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - int result = - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON | TDCBF_CANCEL_BUTTON, TD_WARNING_ICON); - enableAllWindowsExcept(parent); - - return result == IDOK; -} \ No newline at end of file diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp deleted file mode 100644 index 0066668..0000000 --- a/src/libui_sdl/main.cpp +++ /dev/null @@ -1,3061 +0,0 @@ -/* - Copyright 2016-2020 Arisotura - - This file is part of melonDS. - - melonDS is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation, either version 3 of the License, or (at your option) - any later version. - - melonDS is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with melonDS. If not, see http://www.gnu.org/licenses/. 
-*/ - -#include -#include -#include -#include - -#ifndef __WIN32__ -#include -#endif - -#include -#include "libui/ui.h" - -#include "../OpenGLSupport.h" -#include "main_shaders.h" - -#include "../types.h" -#include "../version.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" -#include "DlgInputConfig.h" -#include "DlgVideoSettings.h" -#include "DlgAudioSettings.h" -#include "DlgWifiSettings.h" - -#include "../NDS.h" -#include "../GBACart.h" -#include "../GPU.h" -#include "../SPU.h" -#include "../Wifi.h" -#include "../Platform.h" -#include "../Config.h" -#include "../ARMJIT.h" - -#include "../Savestate.h" - -#include "OSD.h" - -#ifdef MELONCAP -#include "MelonCap.h" -#endif // MELONCAP - - -// savestate slot mapping -// 1-8: regular slots (quick access) -// '9': load/save arbitrary file -const int kSavestateNum[9] = {1, 2, 3, 4, 5, 6, 7, 8, 0}; - -const int kScreenSize[4] = {1, 2, 3, 4}; -const int kScreenRot[4] = {0, 1, 2, 3}; -const int kScreenGap[6] = {0, 1, 8, 64, 90, 128}; -const int kScreenLayout[3] = {0, 1, 2}; -const int kScreenSizing[4] = {0, 1, 2, 3}; - - -char* EmuDirectory; - - -uiWindow* MainWindow; -uiArea* MainDrawArea; -uiAreaHandler MainDrawAreaHandler; - -const u32 kGLVersions[] = {uiGLVersion(3,2), uiGLVersion(3,1), 0}; -uiGLContext* GLContext; - -int WindowWidth, WindowHeight; - -uiMenuItem* MenuItem_SaveState; -uiMenuItem* MenuItem_LoadState; -uiMenuItem* MenuItem_UndoStateLoad; - -uiMenuItem* MenuItem_SaveStateSlot[9]; -uiMenuItem* MenuItem_LoadStateSlot[9]; - -uiMenuItem* MenuItem_Pause; -uiMenuItem* MenuItem_Reset; -uiMenuItem* MenuItem_Stop; - -uiMenuItem* MenuItem_SavestateSRAMReloc; - -uiMenuItem* MenuItem_ScreenRot[4]; -uiMenuItem* MenuItem_ScreenGap[6]; -uiMenuItem* MenuItem_ScreenLayout[3]; -uiMenuItem* MenuItem_ScreenSizing[4]; - -uiMenuItem* MenuItem_ScreenFilter; -uiMenuItem* MenuItem_LimitFPS; -uiMenuItem* MenuItem_AudioSync; -uiMenuItem* MenuItem_ShowOSD; - -SDL_Thread* EmuThread; -int EmuRunning; -volatile int EmuStatus; - -bool RunningSomething; -char ROMPath[2][1024]; -char SRAMPath[2][1024]; -char PrevSRAMPath[2][1024]; // for savestate 'undo load' - -bool SavestateLoaded; - -bool Screen_UseGL; - -bool ScreenDrawInited = false; -uiDrawBitmap* ScreenBitmap[2] = {NULL,NULL}; - -GLuint GL_ScreenShader[3]; -GLuint GL_ScreenShaderAccel[3]; -GLuint GL_ScreenShaderOSD[3]; -struct -{ - float uScreenSize[2]; - u32 u3DScale; - u32 uFilterMode; - -} GL_ShaderConfig; -GLuint GL_ShaderConfigUBO; -GLuint GL_ScreenVertexArrayID, GL_ScreenVertexBufferID; -float GL_ScreenVertices[2 * 3*2 * 4]; // position/texcoord -GLuint GL_ScreenTexture; -bool GL_ScreenSizeDirty; - -int GL_3DScale; - -bool GL_VSyncStatus; - -int ScreenGap = 0; -int ScreenLayout = 0; -int ScreenSizing = 0; -int ScreenRotation = 0; - -int MainScreenPos[3]; -int AutoScreenSizing; - -uiRect TopScreenRect; -uiRect BottomScreenRect; -uiDrawMatrix TopScreenTrans; -uiDrawMatrix BottomScreenTrans; - -bool Touching = false; - -u32 KeyInputMask, JoyInputMask; -u32 KeyHotkeyMask, JoyHotkeyMask; -u32 HotkeyMask, LastHotkeyMask; -u32 HotkeyPress, HotkeyRelease; - -#define HotkeyDown(hk) (HotkeyMask & (1<<(hk))) -#define HotkeyPressed(hk) (HotkeyPress & (1<<(hk))) -#define HotkeyReleased(hk) (HotkeyRelease & (1<<(hk))) - -bool LidStatus; - -int JoystickID; -SDL_Joystick* Joystick; - -int AudioFreq; -float AudioSampleFrac; -SDL_AudioDeviceID AudioDevice, MicDevice; - -SDL_cond* AudioSync; -SDL_mutex* AudioSyncLock; - -u32 MicBufferLength = 2048; -s16 MicBuffer[2048]; -u32 MicBufferReadPos, 
MicBufferWritePos; - -u32 MicWavLength; -s16* MicWavBuffer; - -void SetupScreenRects(int width, int height); - -void TogglePause(void* blarg); -void Reset(void* blarg); - -void SetupSRAMPath(int slot); - -void SaveState(int slot); -void LoadState(int slot); -void UndoStateLoad(); -void GetSavestateName(int slot, char* filename, int len); - -void CreateMainWindow(bool opengl); -void DestroyMainWindow(); -void RecreateMainWindow(bool opengl); - - - -bool GLScreen_InitShader(GLuint* shader, const char* fs) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS, fs, shader, "ScreenShader")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindAttribLocation(shader[2], 1, "vTexcoord"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "ScreenTex"); - glUniform1i(uni_id, 0); - uni_id = glGetUniformLocation(shader[2], "_3DTex"); - glUniform1i(uni_id, 1); - - return true; -} - -bool GLScreen_InitOSDShader(GLuint* shader) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, shader, "ScreenShaderOSD")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "OSDTex"); - glUniform1i(uni_id, 0); - - return true; -} - -bool GLScreen_Init() -{ - GL_VSyncStatus = Config::ScreenVSync; - - // TODO: consider using epoxy? 
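// The two shader-init helpers above share one wiring pattern: bind attribute
// and fragment-output locations before linking, then attach the program's
// "uConfig" uniform block to binding point 16 (where GL_ShaderConfigUBO gets
// bound below) and point each sampler uniform at a fixed texture unit. A
// minimal sketch of that pattern in isolation, assuming a GL 3.2 core
// context with function pointers already loaded; WireScreenShader is a
// hypothetical name, not part of the original file:
void WireScreenShader(GLuint prog) // prog: an already-linked program object
{
    // route the program's uConfig block to binding point 16
    GLuint blk = glGetUniformBlockIndex(prog, "uConfig");
    if (blk != GL_INVALID_INDEX)
        glUniformBlockBinding(prog, blk, 16);

    // sampler uniforms are plain ints naming a texture unit
    glUseProgram(prog);
    GLint uni = glGetUniformLocation(prog, "ScreenTex");
    if (uni >= 0) glUniform1i(uni, 0);  // GL_TEXTURE0
    uni = glGetUniformLocation(prog, "_3DTex");
    if (uni >= 0) glUniform1i(uni, 1);  // GL_TEXTURE1
}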
- if (!OpenGL_Init()) - return false; - - const GLubyte* renderer = glGetString(GL_RENDERER); // get renderer string - const GLubyte* version = glGetString(GL_VERSION); // version as a string - printf("OpenGL: renderer: %s\n", renderer); - printf("OpenGL: version: %s\n", version); - - if (!GLScreen_InitShader(GL_ScreenShader, kScreenFS)) - return false; - if (!GLScreen_InitShader(GL_ScreenShaderAccel, kScreenFS_Accel)) - return false; - if (!GLScreen_InitOSDShader(GL_ScreenShaderOSD)) - return false; - - memset(&GL_ShaderConfig, 0, sizeof(GL_ShaderConfig)); - - glGenBuffers(1, &GL_ShaderConfigUBO); - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - glBufferData(GL_UNIFORM_BUFFER, sizeof(GL_ShaderConfig), &GL_ShaderConfig, GL_STATIC_DRAW); - glBindBufferBase(GL_UNIFORM_BUFFER, 16, GL_ShaderConfigUBO); - - glGenBuffers(1, &GL_ScreenVertexBufferID); - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferData(GL_ARRAY_BUFFER, sizeof(GL_ScreenVertices), NULL, GL_STATIC_DRAW); - - glGenVertexArrays(1, &GL_ScreenVertexArrayID); - glBindVertexArray(GL_ScreenVertexArrayID); - glEnableVertexAttribArray(0); // position - glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(0)); - glEnableVertexAttribArray(1); // texcoord - glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(2*4)); - - glGenTextures(1, &GL_ScreenTexture); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI, 256*3 + 1, 192*2, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, NULL); - - GL_ScreenSizeDirty = true; - - return true; -} - -void GLScreen_DeInit() -{ - glDeleteTextures(1, &GL_ScreenTexture); - - glDeleteVertexArrays(1, &GL_ScreenVertexArrayID); - glDeleteBuffers(1, &GL_ScreenVertexBufferID); - - OpenGL_DeleteShaderProgram(GL_ScreenShader); - OpenGL_DeleteShaderProgram(GL_ScreenShaderAccel); - OpenGL_DeleteShaderProgram(GL_ScreenShaderOSD); -} - -void GLScreen_DrawScreen() -{ - bool vsync = Config::ScreenVSync && !HotkeyDown(HK_FastForward); - if (vsync != GL_VSyncStatus) - { - GL_VSyncStatus = vsync; - uiGLSetVSync(vsync); - } - - float scale = uiGLGetFramebufferScale(GLContext); - - glBindFramebuffer(GL_FRAMEBUFFER, uiGLGetFramebuffer(GLContext)); - - if (GL_ScreenSizeDirty) - { - GL_ScreenSizeDirty = false; - - GL_ShaderConfig.uScreenSize[0] = WindowWidth; - GL_ShaderConfig.uScreenSize[1] = WindowHeight; - GL_ShaderConfig.u3DScale = GL_3DScale; - - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - void* unibuf = glMapBuffer(GL_UNIFORM_BUFFER, GL_WRITE_ONLY); - if (unibuf) memcpy(unibuf, &GL_ShaderConfig, sizeof(GL_ShaderConfig)); - glUnmapBuffer(GL_UNIFORM_BUFFER); - - float scwidth, scheight; - - float x0, y0, x1, y1; - float s0, s1, s2, s3; - float t0, t1, t2, t3; - -#define SETVERTEX(i, x, y, s, t) \ - GL_ScreenVertices[4*(i) + 0] = x; \ - GL_ScreenVertices[4*(i) + 1] = y; \ - GL_ScreenVertices[4*(i) + 2] = s; \ - GL_ScreenVertices[4*(i) + 3] = t; - - x0 = TopScreenRect.X; - y0 = TopScreenRect.Y; - x1 = TopScreenRect.X + TopScreenRect.Width; - y1 = TopScreenRect.Y + TopScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 0; - s1 = scwidth; t1 = 0; - s2 = 0; t2 = scheight; - s3 = 
scwidth; t3 = scheight; - break; - - case 1: - s0 = 0; t0 = scheight; - s1 = 0; t1 = 0; - s2 = scwidth; t2 = scheight; - s3 = scwidth; t3 = 0; - break; - - case 2: - s0 = scwidth; t0 = scheight; - s1 = 0; t1 = scheight; - s2 = scwidth; t2 = 0; - s3 = 0; t3 = 0; - break; - - case 3: - s0 = scwidth; t0 = 0; - s1 = scwidth; t1 = scheight; - s2 = 0; t2 = 0; - s3 = 0; t3 = scheight; - break; - } - - SETVERTEX(0, x0, y0, s0, t0); - SETVERTEX(1, x1, y1, s3, t3); - SETVERTEX(2, x1, y0, s1, t1); - SETVERTEX(3, x0, y0, s0, t0); - SETVERTEX(4, x0, y1, s2, t2); - SETVERTEX(5, x1, y1, s3, t3); - - x0 = BottomScreenRect.X; - y0 = BottomScreenRect.Y; - x1 = BottomScreenRect.X + BottomScreenRect.Width; - y1 = BottomScreenRect.Y + BottomScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 192; - s1 = scwidth; t1 = 192; - s2 = 0; t2 = 192+scheight; - s3 = scwidth; t3 = 192+scheight; - break; - - case 1: - s0 = 0; t0 = 192+scheight; - s1 = 0; t1 = 192; - s2 = scwidth; t2 = 192+scheight; - s3 = scwidth; t3 = 192; - break; - - case 2: - s0 = scwidth; t0 = 192+scheight; - s1 = 0; t1 = 192+scheight; - s2 = scwidth; t2 = 192; - s3 = 0; t3 = 192; - break; - - case 3: - s0 = scwidth; t0 = 192; - s1 = scwidth; t1 = 192+scheight; - s2 = 0; t2 = 192; - s3 = 0; t3 = 192+scheight; - break; - } - - SETVERTEX(6, x0, y0, s0, t0); - SETVERTEX(7, x1, y1, s3, t3); - SETVERTEX(8, x1, y0, s1, t1); - SETVERTEX(9, x0, y0, s0, t0); - SETVERTEX(10, x0, y1, s2, t2); - SETVERTEX(11, x1, y1, s3, t3); - -#undef SETVERTEX - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(GL_ScreenVertices), GL_ScreenVertices); - } - - glDisable(GL_DEPTH_TEST); - glDisable(GL_STENCIL_TEST); - glDisable(GL_BLEND); - glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - - glViewport(0, 0, WindowWidth*scale, WindowHeight*scale); - - if (GPU3D::Renderer == 0) - OpenGL_UseShaderProgram(GL_ScreenShader); - else - OpenGL_UseShaderProgram(GL_ScreenShaderAccel); - - glClearColor(0, 0, 0, 1); - glClear(GL_COLOR_BUFFER_BIT); - - if (RunningSomething) - { - int frontbuf = GPU::FrontBuffer; - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - - if (GPU::Framebuffer[frontbuf][0] && GPU::Framebuffer[frontbuf][1]) - { - if (GPU3D::Renderer == 0) - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - else - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - } - - glActiveTexture(GL_TEXTURE1); - if (GPU3D::Renderer != 0) - GPU3D::GLRenderer::SetupAccelFrame(); - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBindVertexArray(GL_ScreenVertexArrayID); - glDrawArrays(GL_TRIANGLES, 0, 4*3); - } - - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Update(true, NULL); - - glFlush(); - uiGLSwapBuffers(GLContext); -} - -void MicLoadWav(char* name) -{ - SDL_AudioSpec format; - memset(&format, 0, sizeof(SDL_AudioSpec)); - - if (MicWavBuffer) delete[] MicWavBuffer; - MicWavBuffer = NULL; - MicWavLength = 0; - - u8* buf; - u32 len; - if (!SDL_LoadWAV(name, &format, &buf, &len)) - return; - - const u64 dstfreq = 44100; 
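// The conversion loops below are fractional-step nearest-neighbour
// resamplers: step through the source at srclen/dstlen samples per output
// sample, carrying the remainder in a float accumulator. AudioCallback()
// further down uses the exact same scheme. The idea in isolation (sketch;
// src/dst are hypothetical caller-provided buffers):
void ResampleNearest(const s16* src, int srclen, s16* dst, int dstlen)
{
    float res_incr = srclen / (float)dstlen; // source samples per output sample
    float res_timer = 0;
    int res_pos = 0;

    for (int i = 0; i < dstlen; i++)
    {
        dst[i] = src[res_pos];        // nearest (floor) source sample
        res_timer += res_incr;
        while (res_timer >= 1.0f)     // carry the fraction, advance the source
        {
            res_timer -= 1.0f;
            res_pos++;
        }
    }
}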
- - if (format.format == AUDIO_S16 || format.format == AUDIO_U16) - { - int srcinc = format.channels; - len /= (2 * srcinc); - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = ((u16*)buf)[res_pos]; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else if (format.format == AUDIO_S8 || format.format == AUDIO_U8) - { - int srcinc = format.channels; - len /= srcinc; - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = buf[res_pos] << 8; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else - printf("bad WAV format %08X\n", format.format); - - SDL_FreeWAV(buf); -} - -void AudioCallback(void* data, Uint8* stream, int len) -{ - len /= (sizeof(s16) * 2); - - // resample incoming audio to match the output sample rate - - float f_len_in = (len * 32823.6328125) / (float)AudioFreq; - f_len_in += AudioSampleFrac; - int len_in = (int)floor(f_len_in); - AudioSampleFrac = f_len_in - len_in; - - s16 buf_in[1024*2]; - s16* buf_out = (s16*)stream; - - int num_in; - int num_out = len; - - SDL_LockMutex(AudioSyncLock); - num_in = SPU::ReadOutput(buf_in, len_in); - SDL_CondSignal(AudioSync); - SDL_UnlockMutex(AudioSyncLock); - - if (num_in < 1) - { - memset(stream, 0, len*sizeof(s16)*2); - return; - } - - int margin = 6; - if (num_in < len_in-margin) - { - int last = num_in-1; - if (last < 0) last = 0; - - for (int i = num_in; i < len_in-margin; i++) - ((u32*)buf_in)[i] = ((u32*)buf_in)[last]; - - num_in = len_in-margin; - } - - float res_incr = num_in / (float)num_out; - float res_timer = 0; - int res_pos = 0; - - int volume = Config::AudioVolume; - - for (int i = 0; i < len; i++) - { - buf_out[i*2 ] = (buf_in[res_pos*2 ] * volume) >> 8; - buf_out[i*2+1] = (buf_in[res_pos*2+1] * volume) >> 8; - - /*s16 s_l = buf_in[res_pos*2 ]; - s16 s_r = buf_in[res_pos*2+1]; - - float a = res_timer; - float b = 1.0 - a; - s_l = (s_l * a) + (buf_in[(res_pos-1)*2 ] * b); - s_r = (s_r * a) + (buf_in[(res_pos-1)*2+1] * b); - - buf_out[i*2 ] = (s_l * volume) >> 8; - buf_out[i*2+1] = (s_r * volume) >> 8;*/ - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos++; - } - } -} - -void MicCallback(void* data, Uint8* stream, int len) -{ - if (Config::MicInputType != 1) return; - - s16* input = (s16*)stream; - len /= sizeof(s16); - - if ((MicBufferWritePos + len) > MicBufferLength) - { - u32 len1 = MicBufferLength - MicBufferWritePos; - memcpy(&MicBuffer[MicBufferWritePos], &input[0], len1*sizeof(s16)); - memcpy(&MicBuffer[0], &input[len1], (len - len1)*sizeof(s16)); - MicBufferWritePos = len - len1; - } - else - { - memcpy(&MicBuffer[MicBufferWritePos], input, len*sizeof(s16)); - MicBufferWritePos += len; - } -} - -void FeedMicInput() -{ - int type = Config::MicInputType; - bool cmd = HotkeyDown(HK_Mic); - - if ((type != 1 && !cmd) || - (type == 1 && 
MicBufferLength == 0) ||
-        (type == 3 && MicWavBuffer == NULL))
-    {
-        type = 0;
-        MicBufferReadPos = 0;
-    }
-
-    switch (type)
-    {
-    case 0: // no mic
-        NDS::MicInputFrame(NULL, 0);
-        break;
-
-    case 1: // host mic
-        if ((MicBufferReadPos + 735) > MicBufferLength)
-        {
-            s16 tmp[735];
-            u32 len1 = MicBufferLength - MicBufferReadPos;
-            memcpy(&tmp[0], &MicBuffer[MicBufferReadPos], len1*sizeof(s16));
-            memcpy(&tmp[len1], &MicBuffer[0], (735 - len1)*sizeof(s16));
-
-            NDS::MicInputFrame(tmp, 735);
-            MicBufferReadPos = 735 - len1;
-        }
-        else
-        {
-            NDS::MicInputFrame(&MicBuffer[MicBufferReadPos], 735);
-            MicBufferReadPos += 735;
-        }
-        break;
-
-    case 2: // white noise
-        {
-            s16 tmp[735];
-            for (int i = 0; i < 735; i++) tmp[i] = rand() & 0xFFFF;
-            NDS::MicInputFrame(tmp, 735);
-        }
-        break;
-
-    case 3: // WAV
-        if ((MicBufferReadPos + 735) > MicWavLength)
-        {
-            s16 tmp[735];
-            u32 len1 = MicWavLength - MicBufferReadPos;
-            memcpy(&tmp[0], &MicWavBuffer[MicBufferReadPos], len1*sizeof(s16));
-            memcpy(&tmp[len1], &MicWavBuffer[0], (735 - len1)*sizeof(s16));
-
-            NDS::MicInputFrame(tmp, 735);
-            MicBufferReadPos = 735 - len1;
-        }
-        else
-        {
-            NDS::MicInputFrame(&MicWavBuffer[MicBufferReadPos], 735);
-            MicBufferReadPos += 735;
-        }
-        break;
-    }
-}
-
-void OpenJoystick()
-{
-    if (Joystick) SDL_JoystickClose(Joystick);
-
-    int num = SDL_NumJoysticks();
-    if (num < 1)
-    {
-        Joystick = NULL;
-        return;
-    }
-
-    if (JoystickID >= num)
-        JoystickID = 0;
-
-    Joystick = SDL_JoystickOpen(JoystickID);
-}
-
-bool JoystickButtonDown(int val)
-{
-    if (val == -1) return false;
-
-    bool hasbtn = ((val & 0xFFFF) != 0xFFFF);
-
-    if (hasbtn)
-    {
-        if (val & 0x100)
-        {
-            int hatnum = (val >> 4) & 0xF;
-            int hatdir = val & 0xF;
-            Uint8 hatval = SDL_JoystickGetHat(Joystick, hatnum);
-
-            bool pressed = false;
-            if (hatdir == 0x1) pressed = (hatval & SDL_HAT_UP);
-            else if (hatdir == 0x4) pressed = (hatval & SDL_HAT_DOWN);
-            else if (hatdir == 0x2) pressed = (hatval & SDL_HAT_RIGHT);
-            else if (hatdir == 0x8) pressed = (hatval & SDL_HAT_LEFT);
-
-            if (pressed) return true;
-        }
-        else
-        {
-            int btnnum = val & 0xFFFF;
-            Uint8 btnval = SDL_JoystickGetButton(Joystick, btnnum);
-
-            if (btnval) return true;
-        }
-    }
-
-    if (val & 0x10000)
-    {
-        int axisnum = (val >> 24) & 0xF;
-        int axisdir = (val >> 20) & 0xF;
-        Sint16 axisval = SDL_JoystickGetAxis(Joystick, axisnum);
-
-        switch (axisdir)
-        {
-        case 0: // positive
-            if (axisval > 16384) return true;
-            break;
-
-        case 1: // negative
-            if (axisval < -16384) return true;
-            break;
-
-        case 2: // trigger
-            if (axisval > 0) return true;
-            break;
-        }
-    }
-
-    return false;
-}
-
-void ProcessInput()
-{
-    SDL_JoystickUpdate();
-
-    if (Joystick)
-    {
-        if (!SDL_JoystickGetAttached(Joystick))
-        {
-            SDL_JoystickClose(Joystick);
-            Joystick = NULL;
-        }
-    }
-    if (!Joystick && (SDL_NumJoysticks() > 0))
-    {
-        JoystickID = Config::JoystickID;
-        OpenJoystick();
-    }
-
-    JoyInputMask = 0xFFF;
-    for (int i = 0; i < 12; i++)
-        if (JoystickButtonDown(Config::JoyMapping[i]))
-            JoyInputMask &= ~(1<<i);
-
-    JoyHotkeyMask = 0;
-    for (int i = 0; i < HK_MAX; i++)
-        if (JoystickButtonDown(Config::HKJoyMapping[i]))
-            JoyHotkeyMask |= (1<<i);
-
-    HotkeyMask = KeyHotkeyMask | JoyHotkeyMask;
-    HotkeyPress = HotkeyMask & ~LastHotkeyMask;
-    HotkeyRelease = LastHotkeyMask & ~HotkeyMask;
-    LastHotkeyMask = HotkeyMask;
-}
-
-bool JoyButtonPressed(int btnid, int njoybuttons, Uint8* joybuttons, Uint32 hat)
-{
-    if (btnid < 0) return false;
-
-    hat &= ~(hat >
return false; - - bool pressed = false; - if (btnid == 0x101) // up - pressed = (hat & SDL_HAT_UP); - else if (btnid == 0x104) // down - pressed = (hat & SDL_HAT_DOWN); - else if (btnid == 0x102) // right - pressed = (hat & SDL_HAT_RIGHT); - else if (btnid == 0x108) // left - pressed = (hat & SDL_HAT_LEFT); - else if (btnid < njoybuttons) - pressed = joybuttons[btnid] & 0x01; - - return pressed; -} - -void UpdateWindowTitle(void* data) -{ - if (EmuStatus == 0) return; - void** dataarray = (void**)data; - SDL_LockMutex((SDL_mutex*)dataarray[1]); - uiWindowSetTitle(MainWindow, (const char*)dataarray[0]); - SDL_UnlockMutex((SDL_mutex*)dataarray[1]); -} - -void UpdateFPSLimit(void* data) -{ - uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1); -} - -int EmuThreadFunc(void* burp) -{ - NDS::Init(); - - MainScreenPos[0] = 0; - MainScreenPos[1] = 0; - MainScreenPos[2] = 0; - AutoScreenSizing = 0; - - if (Screen_UseGL) - { - uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(true); - uiGLMakeContextCurrent(NULL); - } - else - { - GPU3D::InitRenderer(false); - } - - Touching = false; - KeyInputMask = 0xFFF; - JoyInputMask = 0xFFF; - KeyHotkeyMask = 0; - JoyHotkeyMask = 0; - HotkeyMask = 0; - LastHotkeyMask = 0; - LidStatus = false; - - u32 nframes = 0; - u32 starttick = SDL_GetTicks(); - u32 lasttick = starttick; - u32 lastmeasuretick = lasttick; - u32 fpslimitcount = 0; - u64 perfcount = SDL_GetPerformanceCounter(); - u64 perffreq = SDL_GetPerformanceFrequency(); - float samplesleft = 0; - u32 nsamples = 0; - - char melontitle[100]; - SDL_mutex* titlemutex = SDL_CreateMutex(); - void* titledata[2] = {melontitle, titlemutex}; - - while (EmuRunning != 0) - { - ProcessInput(); - - if (HotkeyPressed(HK_FastForwardToggle)) - { - Config::LimitFPS = !Config::LimitFPS; - uiQueueMain(UpdateFPSLimit, NULL); - } - // TODO: similar hotkeys for video/audio sync? - - if (HotkeyPressed(HK_Pause)) uiQueueMain(TogglePause, NULL); - if (HotkeyPressed(HK_Reset)) uiQueueMain(Reset, NULL); - - if (GBACart::CartInserted && GBACart::HasSolarSensor) - { - if (HotkeyPressed(HK_SolarSensorDecrease)) - { - if (GBACart_SolarSensor::LightLevel > 0) GBACart_SolarSensor::LightLevel--; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - if (HotkeyPressed(HK_SolarSensorIncrease)) - { - if (GBACart_SolarSensor::LightLevel < 10) GBACart_SolarSensor::LightLevel++; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - } - - if (EmuRunning == 1) - { - EmuStatus = 1; - - // process input and hotkeys - NDS::SetKeyMask(KeyInputMask & JoyInputMask); - - if (HotkeyPressed(HK_Lid)) - { - LidStatus = !LidStatus; - NDS::SetLidClosed(LidStatus); - OSD::AddMessage(0, LidStatus ? "Lid closed" : "Lid opened"); - } - - // microphone input - FeedMicInput(); - - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - } - - // auto screen layout - { - MainScreenPos[2] = MainScreenPos[1]; - MainScreenPos[1] = MainScreenPos[0]; - MainScreenPos[0] = NDS::PowerControl9 >> 15; - - int guess; - if (MainScreenPos[0] == MainScreenPos[2] && - MainScreenPos[0] != MainScreenPos[1]) - { - // constant flickering, likely displaying 3D on both screens - // TODO: when both screens are used for 2D only...??? 
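// The three-entry history shifted below tracks NDS::PowerControl9 (POWCNT1)
// bit 15, which selects the physical screen the main 2D engine drives;
// MainScreenPos[0] is the newest sample. An A-B-A pattern means the game
// swaps engines every frame, i.e. it is likely showing 3D on both screens.
// The decision in isolation (sketch; GuessScreenSizing is a hypothetical
// name; returns a sizing mode as used here: 0=even, 1=top, 2=bottom):
int GuessScreenSizing(const int pos[3])
{
    if (pos[0] == pos[2] && pos[0] != pos[1])
        return 0;                  // flickering: treat both screens evenly
    return (pos[0] == 1) ? 1 : 2;  // stable: emphasize the main screen
}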
- guess = 0; - } - else - { - if (MainScreenPos[0] == 1) - guess = 1; - else - guess = 2; - } - - if (guess != AutoScreenSizing) - { - AutoScreenSizing = guess; - SetupScreenRects(WindowWidth, WindowHeight); - } - } - - // emulate - u32 nlines = NDS::RunFrame(); - -#ifdef MELONCAP - MelonCap::Update(); -#endif // MELONCAP - - if (EmuRunning == 0) break; - - if (Screen_UseGL) - { - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - - bool fastforward = HotkeyDown(HK_FastForward); - - if (Config::AudioSync && !fastforward) - { - SDL_LockMutex(AudioSyncLock); - while (SPU::GetOutputSize() > 1024) - { - int ret = SDL_CondWaitTimeout(AudioSync, AudioSyncLock, 500); - if (ret == SDL_MUTEX_TIMEDOUT) break; - } - SDL_UnlockMutex(AudioSyncLock); - } - else - { - // ensure the audio FIFO doesn't overflow - //SPU::TrimOutput(); - } - - float framerate = (1000.0f * nlines) / (60.0f * 263.0f); - - { - u32 curtick = SDL_GetTicks(); - u32 delay = curtick - lasttick; - - bool limitfps = Config::LimitFPS && !fastforward; - if (limitfps) - { - float wantedtickF = starttick + (framerate * (fpslimitcount+1)); - u32 wantedtick = (u32)ceil(wantedtickF); - if (curtick < wantedtick) SDL_Delay(wantedtick - curtick); - - lasttick = SDL_GetTicks(); - fpslimitcount++; - if ((abs(wantedtickF - (float)wantedtick) < 0.001312) || (fpslimitcount > 60)) - { - fpslimitcount = 0; - nsamples = 0; - starttick = lasttick; - } - } - else - { - if (delay < 1) SDL_Delay(1); - lasttick = SDL_GetTicks(); - } - } - - nframes++; - if (nframes >= 30) - { - u32 tick = SDL_GetTicks(); - u32 diff = tick - lastmeasuretick; - lastmeasuretick = tick; - - u32 fps; - if (diff < 1) fps = 77777; - else fps = (nframes * 1000) / diff; - nframes = 0; - - float fpstarget; - if (framerate < 1) fpstarget = 999; - else fpstarget = 1000.0f/framerate; - - SDL_LockMutex(titlemutex); - sprintf(melontitle, "[%d/%.0f] melonDS " MELONDS_VERSION, fps, fpstarget); - SDL_UnlockMutex(titlemutex); - uiQueueMain(UpdateWindowTitle, titledata); - } - } - else - { - // paused - nframes = 0; - lasttick = SDL_GetTicks(); - starttick = lasttick; - lastmeasuretick = lasttick; - fpslimitcount = 0; - - if (EmuRunning == 2) - { - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - } - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - EmuStatus = EmuRunning; - - SDL_Delay(100); - } - } - - EmuStatus = 0; - - SDL_DestroyMutex(titlemutex); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - - NDS::DeInit(); - Platform::LAN_DeInit(); - - if (Screen_UseGL) - { - OSD::DeInit(true); - GLScreen_DeInit(); - } - else - OSD::DeInit(false); - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - return 44203; -} - -void StopEmuThread() -{ - EmuRunning = 0; - SDL_WaitThread(EmuThread, NULL); -} - - -void OnAreaDraw(uiAreaHandler* handler, uiArea* area, uiAreaDrawParams* params) -{ - if (!ScreenDrawInited) - { - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenDrawInited = true; - ScreenBitmap[0] = uiDrawNewBitmap(params->Context, 256, 192, 0); - ScreenBitmap[1] = uiDrawNewBitmap(params->Context, 256, 192, 0); - } - - int frontbuf = GPU::FrontBuffer; - if (!ScreenBitmap[0] || !ScreenBitmap[1]) return; - if (!GPU::Framebuffer[frontbuf][0] || !GPU::Framebuffer[frontbuf][1]) return; - - uiRect top = {0, 0, 256, 192}; - uiRect bot = {0, 0, 256, 192}; 
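// The frame limiter in EmuThreadFunc() above paces against an absolute
// schedule rather than a fixed per-frame delay: the target tick for frame N
// is starttick + frametime*(N+1), so rounding error cannot accumulate (the
// loop above also resyncs starttick periodically to absorb drift after
// pauses). One pacing step of the same scheme (sketch; hypothetical helper,
// frametime in milliseconds, SDL2):
void PaceFrame(Uint32 starttick, u32& fpslimitcount, float frametime)
{
    float target_f = starttick + frametime * (fpslimitcount + 1);
    Uint32 target = (Uint32)ceil(target_f);   // next absolute deadline
    Uint32 now = SDL_GetTicks();
    if (now < target) SDL_Delay(target - now);
    fpslimitcount++;
}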
-
-    uiDrawBitmapUpdate(ScreenBitmap[0], GPU::Framebuffer[frontbuf][0]);
-    uiDrawBitmapUpdate(ScreenBitmap[1], GPU::Framebuffer[frontbuf][1]);
-
-    uiDrawSave(params->Context);
-    uiDrawTransform(params->Context, &TopScreenTrans);
-    uiDrawBitmapDraw(params->Context, ScreenBitmap[0], &top, &TopScreenRect, Config::ScreenFilter==1);
-    uiDrawRestore(params->Context);
-
-    uiDrawSave(params->Context);
-    uiDrawTransform(params->Context, &BottomScreenTrans);
-    uiDrawBitmapDraw(params->Context, ScreenBitmap[1], &bot, &BottomScreenRect, Config::ScreenFilter==1);
-    uiDrawRestore(params->Context);
-
-    OSD::Update(false, params);
-}
-
-void OnAreaMouseEvent(uiAreaHandler* handler, uiArea* area, uiAreaMouseEvent* evt)
-{
-    int x = (int)evt->X;
-    int y = (int)evt->Y;
-
-    if (Touching && (evt->Up == 1))
-    {
-        Touching = false;
-        NDS::ReleaseKey(16+6);
-        NDS::ReleaseScreen();
-    }
-    else if (!Touching && (evt->Down == 1) &&
-             (x >= BottomScreenRect.X) && (y >= BottomScreenRect.Y) &&
-             (x < (BottomScreenRect.X+BottomScreenRect.Width)) && (y < (BottomScreenRect.Y+BottomScreenRect.Height)))
-    {
-        Touching = true;
-        NDS::PressKey(16+6);
-    }
-
-    if (Touching)
-    {
-        x -= BottomScreenRect.X;
-        y -= BottomScreenRect.Y;
-
-        if (ScreenRotation == 0 || ScreenRotation == 2)
-        {
-            if (BottomScreenRect.Width != 256)
-                x = (x * 256) / BottomScreenRect.Width;
-            if (BottomScreenRect.Height != 192)
-                y = (y * 192) / BottomScreenRect.Height;
-
-            if (ScreenRotation == 2)
-            {
-                x = 255 - x;
-                y = 191 - y;
-            }
-        }
-        else
-        {
-            if (BottomScreenRect.Width != 192)
-                x = (x * 192) / BottomScreenRect.Width;
-            if (BottomScreenRect.Height != 256)
-                y = (y * 256) / BottomScreenRect.Height;
-
-            if (ScreenRotation == 1)
-            {
-                int tmp = x;
-                x = y;
-                y = 191 - tmp;
-            }
-            else
-            {
-                int tmp = x;
-                x = 255 - y;
-                y = tmp;
-            }
-        }
-
-        // clamp
-        if (x < 0) x = 0;
-        else if (x > 255) x = 255;
-        if (y < 0) y = 0;
-        else if (y > 191) y = 191;
-
-        // TODO: take advantage of possible extra precision when possible? (scaled window for example)
-        NDS::TouchScreen(x, y);
-    }
-}
-
-void OnAreaMouseCrossed(uiAreaHandler* handler, uiArea* area, int left)
-{
-}
-
-void OnAreaDragBroken(uiAreaHandler* handler, uiArea* area)
-{
-}
-
-bool EventMatchesKey(uiAreaKeyEvent* evt, int val, bool checkmod)
-{
-    if (val == -1) return false;
-
-    int key = val & 0xFFFF;
-    int mod = val >> 16;
-    return evt->Scancode == key && (!checkmod || evt->Modifiers == mod);
-}
-
-int OnAreaKeyEvent(uiAreaHandler* handler, uiArea* area, uiAreaKeyEvent* evt)
-{
-    // TODO: release all keys if the window loses focus? or somehow global key input?
-    if (evt->Scancode == 0x38) // ALT
-        return 0;
-    if (evt->Modifiers == 0x2) // ALT+key
-        return 0;
-
-    if (evt->Up)
-    {
-        for (int i = 0; i < 12; i++)
-            if (EventMatchesKey(evt, Config::KeyMapping[i], false))
-                KeyInputMask |= (1<<i);
-
-        for (int i = 0; i < HK_MAX; i++)
-            if (EventMatchesKey(evt, Config::HKKeyMapping[i], true))
-                KeyHotkeyMask &= ~(1<<i);
-    }
-    else if (!evt->Repeat)
-    {
-        // TODO, eventually: make savestate keys configurable?
-        // F keys: 3B-44, 57-58 | SHIFT: mod. 0x4
-        if (evt->Scancode >= 0x3B && evt->Scancode <= 0x42) // F1-F8, quick savestate
-        {
-            if (evt->Modifiers == 0x4) SaveState(1 + (evt->Scancode - 0x3B));
-            else if (evt->Modifiers == 0x0) LoadState(1 + (evt->Scancode - 0x3B));
-        }
-        else if (evt->Scancode == 0x43) // F9, savestate from/to file
-        {
-            if (evt->Modifiers == 0x4) SaveState(0);
-            else if (evt->Modifiers == 0x0) LoadState(0);
-        }
-        else if (evt->Scancode == 0x58) // F12, undo savestate
-        {
-            if (evt->Modifiers == 0x0) UndoStateLoad();
-        }
-
-        for (int i = 0; i < 12; i++)
-            if (EventMatchesKey(evt, Config::KeyMapping[i], false))
-                KeyInputMask &= ~(1<<i);
-
-        for (int i = 0; i < HK_MAX; i++)
-            if (EventMatchesKey(evt, Config::HKKeyMapping[i], true))
-                KeyHotkeyMask |= (1<<i);
-
-        //if (evt->Scancode == 0x57) // F11
-        //    NDS::debug(0);
-    }
-
-    return 1;
-}
-
-void SetupScreenRects(int width, int height)
-{
-    bool horizontal = false;
-    bool sideways = false;
-
-    if (ScreenRotation == 1 || ScreenRotation == 3)
-        sideways = true;
-
-    if (ScreenLayout == 2) horizontal = true;
-    else if (ScreenLayout == 0)
-    {
-        if (sideways)
-            horizontal = true;
-    }
-
-    int sizemode;
-    if (ScreenSizing == 3)
-        sizemode = AutoScreenSizing;
-    else
-        sizemode = ScreenSizing;
-
-    int screenW, screenH, gap;
-    if (sideways)
-    {
-        screenW = 192;
-        screenH = 256;
-    }
-    else
-    {
-        screenW = 256;
-        screenH = 192;
-    }
-
-    gap = ScreenGap;
-
-    uiRect *topscreen, *bottomscreen;
-    if (ScreenRotation == 1 || ScreenRotation == 2)
-    {
-        topscreen = &BottomScreenRect;
-        bottomscreen = &TopScreenRect;
-    }
-    else
-    {
-        topscreen = &TopScreenRect;
-        bottomscreen = &BottomScreenRect;
-    }
-
-    if (horizontal)
-    {
-        // side-by-side
-
-        int heightreq;
-        int startX = 0;
-
-        width -= gap;
-
-        if (sizemode == 0) // even
-        {
-            heightreq = (width * screenH) / (screenW*2);
-            if (heightreq > height)
-            {
-                int newwidth = (height * width) / heightreq;
-                startX = (width - newwidth) / 2;
-                heightreq = height;
-                width = newwidth;
-            }
-        }
-        else // emph. top/bottom
-        {
-            heightreq = ((width - screenW) * screenH) / screenW;
-            if (heightreq > height)
-            {
-                int newwidth = ((height * (width - screenW)) / heightreq) + screenW;
-                startX = (width - newwidth) / 2;
-                heightreq = height;
-                width = newwidth;
-            }
-        }
-
-        if (sizemode == 2)
-        {
-            topscreen->Width = screenW;
-            topscreen->Height = screenH;
-        }
-        else
-        {
-            topscreen->Width = (sizemode==0) ? (width / 2) : (width - screenW);
-            topscreen->Height = heightreq;
-        }
-        topscreen->X = startX;
-        topscreen->Y = ((height - heightreq) / 2) + (heightreq - topscreen->Height);
-
-        bottomscreen->X = topscreen->X + topscreen->Width + gap;
-
-        if (sizemode == 1)
-        {
-            bottomscreen->Width = screenW;
-            bottomscreen->Height = screenH;
-        }
-        else
-        {
-            bottomscreen->Width = width - topscreen->Width;
-            bottomscreen->Height = heightreq;
-        }
-        bottomscreen->Y = ((height - heightreq) / 2) + (heightreq - bottomscreen->Height);
-    }
-    else
-    {
-        // top then bottom
-
-        int widthreq;
-        int startY = 0;
-
-        height -= gap;
-
-        if (sizemode == 0) // even
-        {
-            widthreq = (height * screenW) / (screenH*2);
-            if (widthreq > width)
-            {
-                int newheight = (width * height) / widthreq;
-                startY = (height - newheight) / 2;
-                widthreq = width;
-                height = newheight;
-            }
-        }
-        else // emph. top/bottom
-        {
-            widthreq = ((height - screenH) * screenW) / screenH;
-            if (widthreq > width)
-            {
-                int newheight = ((width * (height - screenH)) / widthreq) + screenH;
-                startY = (height - newheight) / 2;
-                widthreq = width;
-                height = newheight;
-            }
-        }
-
-        if (sizemode == 2)
-        {
-            topscreen->Width = screenW;
-            topscreen->Height = screenH;
-        }
-        else
-        {
-            topscreen->Width = widthreq;
-            topscreen->Height = (sizemode==0) ?
(height / 2) : (height - screenH); - } - topscreen->Y = startY; - topscreen->X = (width - topscreen->Width) / 2; - - bottomscreen->Y = topscreen->Y + topscreen->Height + gap; - - if (sizemode == 1) - { - bottomscreen->Width = screenW; - bottomscreen->Height = screenH; - } - else - { - bottomscreen->Width = widthreq; - bottomscreen->Height = height - topscreen->Height; - } - bottomscreen->X = (width - bottomscreen->Width) / 2; - } - - // setup matrices for potential rotation - - uiDrawMatrixSetIdentity(&TopScreenTrans); - uiDrawMatrixSetIdentity(&BottomScreenTrans); - - switch (ScreenRotation) - { - case 1: // 90° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y); - } - break; - - case 2: // 180° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - - case 3: // 270° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - } - - GL_ScreenSizeDirty = true; -} - -void SetMinSize(int w, int h) -{ - int cw, ch; - uiWindowContentSize(MainWindow, &cw, &ch); - - uiControlSetMinSize(uiControl(MainDrawArea), w, h); - if ((cw < w) || (ch < h)) - { - if (cw < w) cw = w; - if (ch < h) ch = h; - uiWindowSetContentSize(MainWindow, cw, ch); - } -} - -void OnAreaResize(uiAreaHandler* handler, uiArea* area, int width, int height) -{ - SetupScreenRects(width, height); - - // TODO: - // should those be the size of the uiArea, or the size of the window client area? - // for now the uiArea fills the whole window anyway - // but... 
we never know, I guess - WindowWidth = width; - WindowHeight = height; - - int ismax = uiWindowMaximized(MainWindow); - int ismin = uiWindowMinimized(MainWindow); - - Config::WindowMaximized = ismax; - if (!ismax && !ismin) - { - Config::WindowWidth = width; - Config::WindowHeight = height; - } - - OSD::WindowResized(Screen_UseGL); -} - - -void Run() -{ - EmuRunning = 1; - RunningSomething = true; - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - uiMenuItemEnable(MenuItem_SaveState); - uiMenuItemEnable(MenuItem_LoadState); - - if (SavestateLoaded) - uiMenuItemEnable(MenuItem_UndoStateLoad); - else - uiMenuItemDisable(MenuItem_UndoStateLoad); - - for (int i = 0; i < 8; i++) - { - char ssfile[1024]; - GetSavestateName(i+1, ssfile, 1024); - if (Platform::FileExists(ssfile)) uiMenuItemEnable(MenuItem_LoadStateSlot[i]); - else uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - } - - for (int i = 0; i < 9; i++) uiMenuItemEnable(MenuItem_SaveStateSlot[i]); - uiMenuItemEnable(MenuItem_LoadStateSlot[8]); - - uiMenuItemEnable(MenuItem_Pause); - uiMenuItemEnable(MenuItem_Reset); - uiMenuItemEnable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); -} - -void TogglePause(void* blarg) -{ - if (!RunningSomething) return; - - if (EmuRunning == 1) - { - // enable pause - EmuRunning = 2; - uiMenuItemSetChecked(MenuItem_Pause, 1); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0, "Paused"); - } - else - { - // disable pause - EmuRunning = 1; - uiMenuItemSetChecked(MenuItem_Pause, 0); - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - OSD::AddMessage(0, "Resumed"); - } -} - -void Reset(void* blarg) -{ - if (!RunningSomething) return; - - EmuRunning = 2; - while (EmuStatus != 2); - - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - if (ROMPath[0][0] == '\0') - NDS::LoadBIOS(); - else - { - SetupSRAMPath(0); - NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot); - } - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - - Run(); - - OSD::AddMessage(0, "Reset"); -} - -void Stop(bool internal) -{ - EmuRunning = 2; - if (!internal) // if shutting down from the UI thread, wait till the emu thread has stopped - while (EmuStatus != 2); - RunningSomething = false; - - // eject any inserted GBA cartridge - GBACart::Eject(); - ROMPath[1][0] = '\0'; - - uiWindowSetTitle(MainWindow, "melonDS " MELONDS_VERSION); - - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]); - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - uiMenuItemDisable(MenuItem_UndoStateLoad); - - uiMenuItemDisable(MenuItem_Pause); - uiMenuItemDisable(MenuItem_Reset); - uiMenuItemDisable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); - - uiAreaQueueRedrawAll(MainDrawArea); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0xFFC040, "Shutdown"); -} - -void SetupSRAMPath(int slot) -{ - strncpy(SRAMPath[slot], ROMPath[slot], 1023); - SRAMPath[slot][1023] = '\0'; - strncpy(SRAMPath[slot] + strlen(ROMPath[slot]) - 3, "sav", 3); -} - -void TryLoadROM(char* file, int slot, int prevstatus) -{ - char oldpath[1024]; - char oldsram[1024]; - strncpy(oldpath, ROMPath[slot], 1024); - strncpy(oldsram, SRAMPath[slot], 1024); - - 
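// SetupSRAMPath() above blindly overwrites the last three characters of the
// ROM path with "sav", which only works for three-letter extensions on
// paths longer than three characters. A more defensive variant (sketch;
// hypothetical helper, same globals as above, uses <string.h>):
void SetupSRAMPathSafe(int slot)
{
    strncpy(SRAMPath[slot], ROMPath[slot], 1023);
    SRAMPath[slot][1023] = '\0';

    char* dot = strrchr(SRAMPath[slot], '.');        // last extension dot, if any
    size_t base = dot ? (size_t)(dot - SRAMPath[slot]) : strlen(SRAMPath[slot]);
    if (base > 1023 - 4) base = 1023 - 4;            // leave room for ".sav"
    strcpy(&SRAMPath[slot][base], ".sav");
}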
strncpy(ROMPath[slot], file, 1023); - ROMPath[slot][1023] = '\0'; - - SetupSRAMPath(0); - SetupSRAMPath(1); - - if (slot == 0 && NDS::LoadROM(ROMPath[slot], SRAMPath[slot], Config::DirectBoot)) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - // Reload the inserted GBA cartridge (if any) - if (ROMPath[1][0] != '\0') NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - Run(); - } - else if (slot == 1 && NDS::LoadGBAROM(ROMPath[slot], SRAMPath[slot])) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - if (RunningSomething) Run(); // do not start just from a GBA cart - } - else - { - uiMsgBoxError(MainWindow, - "Failed to load the ROM", - "Make sure the file can be accessed and isn't opened in another application."); - - strncpy(ROMPath[slot], oldpath, 1024); - strncpy(SRAMPath[slot], oldsram, 1024); - EmuRunning = prevstatus; - } -} - - -// SAVESTATE TODO -// * configurable paths. not everyone wants their ROM directory to be polluted, I guess. - -void GetSavestateName(int slot, char* filename, int len) -{ - int pos; - - if (ROMPath[0][0] == '\0') // running firmware, no ROM - { - strcpy(filename, "firmware"); - pos = 8; - } - else - { - int l = strlen(ROMPath[0]); - pos = l; - while (ROMPath[0][pos] != '.' && pos > 0) pos--; - if (pos == 0) pos = l; - - // avoid buffer overflow. shoddy - if (pos > len-5) pos = len-5; - - strncpy(&filename[0], ROMPath[0], pos); - } - strcpy(&filename[pos], ".ml"); - filename[pos+3] = '0'+slot; - filename[pos+4] = '\0'; -} - -void LoadState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiOpenFile(MainWindow, "melonDS savestate (any)|*.ml1;*.ml2;*.ml3;*.ml4;*.ml5;*.ml6;*.ml7;*.ml8;*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - if (!Platform::FileExists(filename)) - { - char msg[64]; - if (slot > 0) sprintf(msg, "State slot %d is empty", slot); - else sprintf(msg, "State file does not exist"); - OSD::AddMessage(0xFFA0A0, msg); - - EmuRunning = prevstatus; - return; - } - - u32 oldGBACartCRC = GBACart::CartCRC; - - // backup - Savestate* backup = new Savestate("timewarp.mln", true); - NDS::DoSavestate(backup); - delete backup; - - bool failed = false; - - Savestate* state = new Savestate(filename, false); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not load savestate file."); - - // current state might be crapoed, so restore from sane backup - state = new Savestate("timewarp.mln", false); - failed = true; - } - - NDS::DoSavestate(state); - delete state; - - if (!failed) - { - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(PrevSRAMPath[0], SRAMPath[0], 1024); - - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], false); - } - - bool loadedPartialGBAROM = false; - - // in case we have a GBA cart inserted, and the GBA ROM changes - // due to having loaded a save state, we do not want to reload - // the previous cartridge on reset, or commit writes to any - // loaded save file. therefore, their paths are "nulled". 
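// LoadState() above guards against a corrupt savestate by serializing the
// current state to "timewarp.mln" first, then rolling back from that backup
// if the incoming file fails to open. The pattern in isolation (sketch;
// Savestate and NDS::DoSavestate as used above, hypothetical wrapper name):
bool LoadStateGuarded(const char* path)
{
    Savestate* backup = new Savestate("timewarp.mln", true); // write backup
    NDS::DoSavestate(backup);
    delete backup;

    Savestate* state = new Savestate(path, false);           // open target
    bool ok = !state->Error;
    if (!ok)
    {
        delete state;
        state = new Savestate("timewarp.mln", false);        // roll back
    }
    NDS::DoSavestate(state);                                 // apply whichever
    delete state;
    return ok;
}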
- if (GBACart::CartInserted && GBACart::CartCRC != oldGBACartCRC) - { - ROMPath[1][0] = '\0'; - SRAMPath[1][0] = '\0'; - loadedPartialGBAROM = true; - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State loaded from slot %d%s", - slot, loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - else sprintf(msg, "State loaded from file%s", - loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - OSD::AddMessage(0, msg); - - SavestateLoaded = true; - uiMenuItemEnable(MenuItem_UndoStateLoad); - } - - EmuRunning = prevstatus; -} - -void SaveState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiSaveFile(MainWindow, "melonDS savestate (*.mln)|*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - Savestate* state = new Savestate(filename, true); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not save state."); - } - else - { - NDS::DoSavestate(state); - delete state; - - if (slot > 0) - uiMenuItemEnable(MenuItem_LoadStateSlot[slot-1]); - - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], true); - } - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State saved to slot %d", slot); - else sprintf(msg, "State saved to file"); - OSD::AddMessage(0, msg); - - EmuRunning = prevstatus; -} - -void UndoStateLoad() -{ - if (!SavestateLoaded) return; - - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - // pray that this works - // what do we do if it doesn't??? - // but it should work. - Savestate* backup = new Savestate("timewarp.mln", false); - NDS::DoSavestate(backup); - delete backup; - - if (ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], PrevSRAMPath[0], 1024); - NDS::RelocateSave(SRAMPath[0], false); - } - - OSD::AddMessage(0, "State load undone"); - - EmuRunning = prevstatus; -} - - -void CloseAllDialogs() -{ - DlgAudioSettings::Close(); - DlgEmuSettings::Close(); - DlgInputConfig::Close(0); - DlgInputConfig::Close(1); - DlgVideoSettings::Close(); - DlgWifiSettings::Close(); -} - - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - uiQuit(); - return 1; -} - -void OnDropFile(uiWindow* window, char* file, void* blarg) -{ - char* ext = &file[strlen(file)-3]; - int prevstatus = EmuRunning; - - if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl")) - { - if (RunningSomething) - { - EmuRunning = 2; - while (EmuStatus != 2); - } - - TryLoadROM(file, 0, prevstatus); - } - else if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } -} - -void OnGetFocus(uiWindow* window, void* blarg) -{ - uiControlSetFocus(uiControl(MainDrawArea)); -} - -void OnLoseFocus(uiWindow* window, void* blarg) -{ - // TODO: shit here? 
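// OnDropFile() above takes the extension as &file[strlen(file)-3], which
// reads out of bounds for paths shorter than three characters and cannot
// handle longer extensions. A safer lookup keyed on the last dot (sketch;
// hypothetical helper, uses <string.h>):
const char* FileExtension(const char* path)
{
    const char* dot = strrchr(path, '.');
    return dot ? dot + 1 : "";  // "" when the name has no extension
}
// usage: if (!strcasecmp(FileExtension(file), "nds")) ...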
-} - -void OnCloseByMenu(uiMenuItem* item, uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - DestroyMainWindow(); - uiQuit(); -} - -void OnOpenFile(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char* file = uiOpenFile(window, "DS ROM (*.nds)|*.nds;*.srl|GBA ROM (*.gba)|*.gba|Any file|*.*", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - int pos = strlen(file)-1; - while (file[pos] != '/' && file[pos] != '\\' && pos > 0) pos--; - strncpy(Config::LastROMFolder, file, pos); - Config::LastROMFolder[pos] = '\0'; - char* ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } - else - { - TryLoadROM(file, 0, prevstatus); - } - - uiFreeText(file); -} - -void OnSaveState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - SaveState(slot); -} - -void OnLoadState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - LoadState(slot); -} - -void OnUndoStateLoad(uiMenuItem* item, uiWindow* window, void* param) -{ - UndoStateLoad(); -} - -void OnRun(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) - { - ROMPath[0][0] = '\0'; - NDS::LoadBIOS(); - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - } - - Run(); -} - -void OnPause(uiMenuItem* item, uiWindow* window, void* blarg) -{ - TogglePause(NULL); -} - -void OnReset(uiMenuItem* item, uiWindow* window, void* blarg) -{ - Reset(NULL); -} - -void OnStop(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) return; - - Stop(false); -} - -void OnOpenEmuSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgEmuSettings::Open(); -} - -void OnOpenInputConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(0); -} - -void OnOpenHotkeyConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(1); -} - -void OnOpenVideoSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgVideoSettings::Open(); -} - -void OnOpenAudioSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgAudioSettings::Open(); -} - -void OnOpenWifiSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgWifiSettings::Open(); -} - - -void OnSetSavestateSRAMReloc(uiMenuItem* item, uiWindow* window, void* param) -{ - Config::SavestateRelocSRAM = uiMenuItemChecked(item) ? 
1:0; -} - - -void EnsureProperMinSize() -{ - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w0 = 256; - int h0 = 192; - int w1 = 256; - int h1 = 192; - - if (ScreenLayout == 0) // natural - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - SetMinSize(std::max(h0,h1), w0+ScreenGap+w1); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else // horizontal - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(w0+ScreenGap+w1, std::max(h0,h1)); - } -} - -void OnSetScreenSize(uiMenuItem* item, uiWindow* window, void* param) -{ - int factor = *(int*)param; - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w = 256*factor; - int h = 192*factor; - - // FIXME - - if (ScreenLayout == 0) // natural - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - uiWindowSetContentSize(window, h, (w*2)+ScreenGap); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else // horizontal - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, (w*2)+ScreenGap, h); - } -} - -void OnSetScreenRotation(uiMenuItem* item, uiWindow* window, void* param) -{ - int rot = *(int*)param; - - int oldrot = ScreenRotation; - ScreenRotation = rot; - - int w, h; - uiWindowContentSize(window, &w, &h); - - bool isHori = (rot == 1 || rot == 3); - bool wasHori = (oldrot == 1 || oldrot == 3); - - EnsureProperMinSize(); - - if (ScreenLayout == 0) // natural - { - if (isHori ^ wasHori) - { - int blarg = h; - h = w; - w = blarg; - - uiWindowSetContentSize(window, w, h); - } - } - - SetupScreenRects(w, h); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenRot[i], i==ScreenRotation); -} - -void OnSetScreenGap(uiMenuItem* item, uiWindow* window, void* param) -{ - int gap = *(int*)param; - - //int oldgap = ScreenGap; - ScreenGap = gap; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 6; i++) - uiMenuItemSetChecked(MenuItem_ScreenGap[i], kScreenGap[i]==ScreenGap); -} - -void OnSetScreenLayout(uiMenuItem* item, uiWindow* window, void* param) -{ - int layout = *(int*)param; - ScreenLayout = layout; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 3; i++) - uiMenuItemSetChecked(MenuItem_ScreenLayout[i], i==ScreenLayout); -} - -void OnSetScreenSizing(uiMenuItem* item, uiWindow* window, void* param) -{ - int sizing = *(int*)param; - ScreenSizing = sizing; - - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenSizing[i], i==ScreenSizing); -} - -void OnSetScreenFiltering(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ScreenFilter = 1; - else Config::ScreenFilter = 0; -} - -void OnSetLimitFPS(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::LimitFPS = true; - else Config::LimitFPS = false; -} - -void OnSetAudioSync(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::AudioSync = true; - else Config::AudioSync = false; -} - -void OnSetShowOSD(uiMenuItem* item, 
uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ShowOSD = true; - else Config::ShowOSD = false; -} - -void ApplyNewSettings(int type) -{ -#ifdef JIT_ENABLED - if (type == 4) - { - Reset(NULL); - return; - } -#endif - - if (!RunningSomething) - { - if (type == 1) return; - } - - int prevstatus = EmuRunning; - EmuRunning = 3; - while (EmuStatus != 3); - - if (type == 0) // 3D renderer settings - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::UpdateRendererConfig(); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - GL_3DScale = Config::GL_ScaleFactor; // dorp - GL_ScreenSizeDirty = true; - } - else if (type == 1) // wifi settings - { - if (Wifi::MPInited) - { - Platform::MP_DeInit(); - Platform::MP_Init(); - } - - Platform::LAN_DeInit(); - Platform::LAN_Init(); - } - else if (type == 2) // video output method - { - bool usegl = Config::ScreenUseGL || (Config::_3DRenderer != 0); - if (usegl != Screen_UseGL) - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - OSD::DeInit(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - Screen_UseGL = usegl; - RecreateMainWindow(usegl); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - } - else if (type == 3) // 3D renderer - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - EmuRunning = prevstatus; -} - - -void CreateMainWindowMenu() -{ - uiMenu* menu; - uiMenuItem* menuitem; - - menu = uiNewMenu("File"); - menuitem = uiMenuAppendItem(menu, "Open ROM..."); - uiMenuItemOnClicked(menuitem, OnOpenFile, NULL); - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Save state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tShift+F%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tShift+F9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnSaveState, (void*)&kSavestateNum[i]); - - MenuItem_SaveStateSlot[i] = ssitem; - } - - MenuItem_SaveState = uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Load state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tF%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tF9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnLoadState, (void*)&kSavestateNum[i]); - - MenuItem_LoadStateSlot[i] = ssitem; - } - - MenuItem_LoadState = uiMenuAppendSubmenu(menu, submenu); - } - menuitem = uiMenuAppendItem(menu, "Undo state load\tF12"); - uiMenuItemOnClicked(menuitem, OnUndoStateLoad, NULL); - MenuItem_UndoStateLoad = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Quit"); - uiMenuItemOnClicked(menuitem, OnCloseByMenu, NULL); - - menu = uiNewMenu("System"); - menuitem = uiMenuAppendItem(menu, "Run"); - uiMenuItemOnClicked(menuitem, OnRun, NULL); - menuitem = uiMenuAppendCheckItem(menu, "Pause"); - uiMenuItemOnClicked(menuitem, OnPause, NULL); - MenuItem_Pause = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Reset"); - uiMenuItemOnClicked(menuitem, OnReset, NULL); - MenuItem_Reset = menuitem; - menuitem = uiMenuAppendItem(menu, "Stop"); - uiMenuItemOnClicked(menuitem, OnStop, NULL); - 
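// ApplyNewSettings() above (like OnCloseWindow and Stop) uses a small
// handshake to get the emu thread to a safe point: the UI thread publishes a
// request through EmuRunning, then spin-waits until the emu loop echoes the
// value into EmuStatus from its paused branch. The protocol in isolation
// (sketch; hypothetical wrapper over the same globals; the busy-wait is
// bounded because the emu thread parks within roughly one frame):
void WithEmuStopped(void (*apply)())
{
    int prev = EmuRunning;
    EmuRunning = 3;          // ask the emu thread to park
    while (EmuStatus != 3);  // wait for the acknowledgement
    apply();                 // safe: emulation is not mid-frame
    EmuRunning = prev;       // resume the previous state
}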
MenuItem_Stop = menuitem; - - menu = uiNewMenu("Config"); - { - menuitem = uiMenuAppendItem(menu, "Emu settings"); - uiMenuItemOnClicked(menuitem, OnOpenEmuSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Input config"); - uiMenuItemOnClicked(menuitem, OnOpenInputConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Hotkey config"); - uiMenuItemOnClicked(menuitem, OnOpenHotkeyConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Video settings"); - uiMenuItemOnClicked(menuitem, OnOpenVideoSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Audio settings"); - uiMenuItemOnClicked(menuitem, OnOpenAudioSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Wifi settings"); - uiMenuItemOnClicked(menuitem, OnOpenWifiSettings, NULL); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Savestate settings"); - - MenuItem_SavestateSRAMReloc = uiMenuAppendCheckItem(submenu, "Separate savefiles"); - uiMenuItemOnClicked(MenuItem_SavestateSRAMReloc, OnSetSavestateSRAMReloc, NULL); - - uiMenuAppendSubmenu(menu, submenu); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Screen size"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%dx", kScreenSize[i]); - uiMenuItem* item = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(item, OnSetScreenSize, (void*)&kScreenSize[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen rotation"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%d", kScreenRot[i]*90); - MenuItem_ScreenRot[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenRot[i], OnSetScreenRotation, (void*)&kScreenRot[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Mid-screen gap"); - - //for (int i = 0; kScreenGap[i] != -1; i++) - for (int i = 0; i < 6; i++) - { - char name[32]; - sprintf(name, "%d pixels", kScreenGap[i]); - MenuItem_ScreenGap[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenGap[i], OnSetScreenGap, (void*)&kScreenGap[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen layout"); - - MenuItem_ScreenLayout[0] = uiMenuAppendCheckItem(submenu, "Natural"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[0], OnSetScreenLayout, (void*)&kScreenLayout[0]); - MenuItem_ScreenLayout[1] = uiMenuAppendCheckItem(submenu, "Vertical"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[1], OnSetScreenLayout, (void*)&kScreenLayout[1]); - MenuItem_ScreenLayout[2] = uiMenuAppendCheckItem(submenu, "Horizontal"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[2], OnSetScreenLayout, (void*)&kScreenLayout[2]); - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen sizing"); - - MenuItem_ScreenSizing[0] = uiMenuAppendCheckItem(submenu, "Even"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[0], OnSetScreenSizing, (void*)&kScreenSizing[0]); - MenuItem_ScreenSizing[1] = uiMenuAppendCheckItem(submenu, "Emphasize top"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[1], OnSetScreenSizing, (void*)&kScreenSizing[1]); - MenuItem_ScreenSizing[2] = uiMenuAppendCheckItem(submenu, "Emphasize bottom"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[2], OnSetScreenSizing, (void*)&kScreenSizing[2]); - MenuItem_ScreenSizing[3] = uiMenuAppendCheckItem(submenu, "Auto"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[3], OnSetScreenSizing, (void*)&kScreenSizing[3]); - - uiMenuAppendSubmenu(menu, submenu); - } - - MenuItem_ScreenFilter = 
uiMenuAppendCheckItem(menu, "Screen filtering"); - uiMenuItemOnClicked(MenuItem_ScreenFilter, OnSetScreenFiltering, NULL); - - MenuItem_ShowOSD = uiMenuAppendCheckItem(menu, "Show OSD"); - uiMenuItemOnClicked(MenuItem_ShowOSD, OnSetShowOSD, NULL); - - uiMenuAppendSeparator(menu); - - MenuItem_LimitFPS = uiMenuAppendCheckItem(menu, "Limit framerate"); - uiMenuItemOnClicked(MenuItem_LimitFPS, OnSetLimitFPS, NULL); - - MenuItem_AudioSync = uiMenuAppendCheckItem(menu, "Audio sync"); - uiMenuItemOnClicked(MenuItem_AudioSync, OnSetAudioSync, NULL); -} - -void CreateMainWindow(bool opengl) -{ - MainWindow = uiNewWindow("melonDS " MELONDS_VERSION, - WindowWidth, WindowHeight, - Config::WindowMaximized, 1, 1); - uiWindowOnClosing(MainWindow, OnCloseWindow, NULL); - - uiWindowSetDropTarget(MainWindow, 1); - uiWindowOnDropFile(MainWindow, OnDropFile, NULL); - - uiWindowOnGetFocus(MainWindow, OnGetFocus, NULL); - uiWindowOnLoseFocus(MainWindow, OnLoseFocus, NULL); - - ScreenDrawInited = false; - bool opengl_good = opengl; - - if (!opengl) MainDrawArea = uiNewArea(&MainDrawAreaHandler); - else MainDrawArea = uiNewGLArea(&MainDrawAreaHandler, kGLVersions); - - uiWindowSetChild(MainWindow, uiControl(MainDrawArea)); - uiControlSetMinSize(uiControl(MainDrawArea), 256, 384); - uiAreaSetBackgroundColor(MainDrawArea, 0, 0, 0); - - uiControlShow(uiControl(MainWindow)); - uiControlSetFocus(uiControl(MainDrawArea)); - - if (opengl_good) - { - GLContext = uiAreaGetGLContext(MainDrawArea); - if (!GLContext) opengl_good = false; - } - if (opengl_good) - { - uiGLMakeContextCurrent(GLContext); - uiGLSetVSync(Config::ScreenVSync); - if (!GLScreen_Init()) opengl_good = false; - if (opengl_good) - { - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Init(true); - } - uiGLMakeContextCurrent(NULL); - } - - if (opengl && !opengl_good) - { - printf("OpenGL: initialization failed\n"); - RecreateMainWindow(false); - Screen_UseGL = false; - } - - if (!opengl) OSD::Init(false); -} - -void DestroyMainWindow() -{ - uiControlDestroy(uiControl(MainWindow)); - - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenBitmap[0] = NULL; - ScreenBitmap[1] = NULL; -} - -void RecreateMainWindow(bool opengl) -{ - int winX, winY, maxi; - uiWindowPosition(MainWindow, &winX, &winY); - maxi = uiWindowMaximized(MainWindow); - DestroyMainWindow(); - CreateMainWindow(opengl); - uiWindowSetPosition(MainWindow, winX, winY); - uiWindowSetMaximized(MainWindow, maxi); -} - - -int main(int argc, char** argv) -{ - srand(time(NULL)); - - printf("melonDS " MELONDS_VERSION "\n"); - printf(MELONDS_URL "\n"); - -#if defined(__WIN32__) || defined(UNIX_PORTABLE) - if (argc > 0 && strlen(argv[0]) > 0) - { - int len = strlen(argv[0]); - while (len > 0) - { - if (argv[0][len] == '/') break; - if (argv[0][len] == '\\') break; - len--; - } - if (len > 0) - { - EmuDirectory = new char[len+1]; - strncpy(EmuDirectory, argv[0], len); - EmuDirectory[len] = '\0'; - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } -#else - const char* confdir = g_get_user_config_dir(); - const char* confname = "/melonDS"; - EmuDirectory = new char[strlen(confdir) + strlen(confname) + 1]; - strcat(EmuDirectory, confdir); - strcat(EmuDirectory, confname); -#endif - - // http://stackoverflow.com/questions/14543333/joystick-wont-work-using-sdl - SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1"); - - if 
-
-    // http://stackoverflow.com/questions/14543333/joystick-wont-work-using-sdl
-    SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1");
-
-    if (SDL_Init(SDL_INIT_HAPTIC) < 0)
-    {
-        printf("SDL couldn't init rumble\n");
-    }
-    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_JOYSTICK) < 0)
-    {
-        printf("SDL shat itself :(\n");
-        return 1;
-    }
-
-    SDL_JoystickEventState(SDL_ENABLE);
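The double SDL_Init above treats haptic (rumble) as optional and video/audio/joystick as mandatory; only the second failure aborts. SDL expresses the same intent more explicitly through SDL_InitSubSystem, shown here as a sketch rather than what the removed code did:

    if (SDL_InitSubSystem(SDL_INIT_HAPTIC) < 0)
        printf("SDL couldn't init rumble\n");  // non-fatal: rumble is a nicety
    if (SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_JOYSTICK) < 0)
        return 1;                              // fatal: the frontend cannot run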
-
-    uiInitOptions ui_opt;
-    memset(&ui_opt, 0, sizeof(uiInitOptions));
-    const char* ui_err = uiInit(&ui_opt);
-    if (ui_err != NULL)
-    {
-        printf("libui shat itself :( %s\n", ui_err);
-        uiFreeInitError(ui_err);
-        return 1;
-    }
-
-    Config::Load();
-
-    if (Config::AudioVolume < 0) Config::AudioVolume = 0;
-    else if (Config::AudioVolume > 256) Config::AudioVolume = 256;
-
-    if (!Platform::LocalFileExists("bios7.bin") ||
-        !Platform::LocalFileExists("bios9.bin") ||
-        !Platform::LocalFileExists("firmware.bin"))
-    {
-#if defined(__WIN32__) || defined(UNIX_PORTABLE)
-        const char* locationName = "the directory you run melonDS from";
-#else
-        char* locationName = EmuDirectory;
-#endif
-        char msgboxtext[512];
-        sprintf(msgboxtext,
-            "One or more of the following required files don't exist or couldn't be accessed:\n\n"
-            "bios7.bin -- ARM7 BIOS\n"
-            "bios9.bin -- ARM9 BIOS\n"
-            "firmware.bin -- firmware image\n\n"
-            "Dump the files from your DS and place them in %s.\n"
-            "Make sure that the files can be accessed.",
-            locationName
-        );
-
-        uiMsgBoxError(NULL, "BIOS/Firmware not found", msgboxtext);
-
-        uiUninit();
-        SDL_Quit();
-        return 0;
-    }
-    if (!Platform::LocalFileExists("firmware.bin.bak"))
-    {
-        // verify the firmware
-        //
-        // there are dumps of an old hacked firmware floating around on the internet
-        // and those are problematic
-        // the hack predates WFC, and, due to this, any game that alters the WFC
-        // access point data will brick that firmware due to it having critical
-        // data in the same area. it has the same problem on hardware.
-        //
-        // but this should help stop users from reporting that issue over and over
-        // again, when the issue is not from melonDS but from their firmware dump.
-        //
-        // I don't know about all the firmware hacks in existence, but the one I
-        // looked at has 0x180 bytes from the header repeated at 0x3FC80, but
-        // bytes 0x0C-0x14 are different.
-
-        FILE* f = Platform::OpenLocalFile("firmware.bin", "rb");
-        u8 chk1[0x180], chk2[0x180];
-
-        fseek(f, 0, SEEK_SET);
-        fread(chk1, 1, 0x180, f);
-        fseek(f, -0x380, SEEK_END);
-        fread(chk2, 1, 0x180, f);
-
-        memset(&chk1[0x0C], 0, 8);
-        memset(&chk2[0x0C], 0, 8);
-
-        fclose(f);
-
-        if (!memcmp(chk1, chk2, 0x180))
-        {
-            uiMsgBoxError(NULL,
-                "Problematic firmware dump",
-                "You are using an old hacked firmware dump.\n"
-                "Firmware boot will stop working if you run any game that alters WFC settings.\n\n"
-                "Note that the issue is not from melonDS, it would also happen on an actual DS.");
-        }
-    }
-    {
-        const char* romlist_missing = "Save memory type detection will not work correctly.\n\n"
-            "You should use the latest version of romlist.bin (provided in melonDS release packages).";
-#if !defined(UNIX_PORTABLE) && !defined(__WIN32__)
-        std::string missingstr = std::string(romlist_missing) +
-            "\n\nThe ROM list should be placed in " + g_get_user_data_dir() + "/melonds/, otherwise "
-            "melonDS will search for it in the current working directory.";
-        const char* romlist_missing_text = missingstr.c_str();
-#else
-        const char* romlist_missing_text = romlist_missing;
-#endif
-
-        FILE* f = Platform::OpenDataFile("romlist.bin");
-        if (f)
-        {
-            u32 data;
-            fread(&data, 4, 1, f);
-            fclose(f);
-
-            if ((data >> 24) == 0) // old CRC-based list
-            {
-                uiMsgBoxError(NULL, "Your version of romlist.bin is outdated.", romlist_missing_text);
-            }
-        }
-        else
-        {
-            uiMsgBoxError(NULL, "romlist.bin not found.", romlist_missing_text);
-        }
-    }
-
-    CreateMainWindowMenu();
-
-    MainDrawAreaHandler.Draw = OnAreaDraw;
-    MainDrawAreaHandler.MouseEvent = OnAreaMouseEvent;
-    MainDrawAreaHandler.MouseCrossed = OnAreaMouseCrossed;
-    MainDrawAreaHandler.DragBroken = OnAreaDragBroken;
-    MainDrawAreaHandler.KeyEvent = OnAreaKeyEvent;
-    MainDrawAreaHandler.Resize = OnAreaResize;
-
-    WindowWidth = Config::WindowWidth;
-    WindowHeight = Config::WindowHeight;
-
-    Screen_UseGL = Config::ScreenUseGL || (Config::_3DRenderer != 0);
-
-    GL_3DScale = Config::GL_ScaleFactor;
-    if (GL_3DScale < 1) GL_3DScale = 1;
-    else if (GL_3DScale > 8) GL_3DScale = 8;
-
-    CreateMainWindow(Screen_UseGL);
-
-    ScreenRotation = Config::ScreenRotation;
-    ScreenGap = Config::ScreenGap;
-    ScreenLayout = Config::ScreenLayout;
-    ScreenSizing = Config::ScreenSizing;
-
-#define SANITIZE(var, min, max)  if ((var < min) || (var > max)) var = 0;
-    SANITIZE(ScreenRotation, 0, 3);
-    SANITIZE(ScreenLayout, 0, 2);
-    SANITIZE(ScreenSizing, 0, 3);
-#undef SANITIZE
-
-    for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]);
-    for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]);
-    uiMenuItemDisable(MenuItem_UndoStateLoad);
-
-    uiMenuItemDisable(MenuItem_Pause);
-    uiMenuItemDisable(MenuItem_Reset);
-    uiMenuItemDisable(MenuItem_Stop);
-
-    uiMenuItemSetChecked(MenuItem_SavestateSRAMReloc, Config::SavestateRelocSRAM?1:0);
-
-    uiMenuItemSetChecked(MenuItem_ScreenRot[ScreenRotation], 1);
-    uiMenuItemSetChecked(MenuItem_ScreenLayout[ScreenLayout], 1);
-    uiMenuItemSetChecked(MenuItem_ScreenSizing[ScreenSizing], 1);
-
-    for (int i = 0; i < 6; i++)
-    {
-        if (ScreenGap == kScreenGap[i])
-            uiMenuItemSetChecked(MenuItem_ScreenGap[i], 1);
-    }
-
-    OnSetScreenRotation(MenuItem_ScreenRot[ScreenRotation], MainWindow, (void*)&kScreenRot[ScreenRotation]);
-
-    uiMenuItemSetChecked(MenuItem_ScreenFilter, Config::ScreenFilter==1);
-    uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1);
-    uiMenuItemSetChecked(MenuItem_AudioSync, Config::AudioSync==1);
-    uiMenuItemSetChecked(MenuItem_ShowOSD, Config::ShowOSD==1);
-
-#ifdef MELONCAP
-    MelonCap::Init();
-#endif // MELONCAP
-
-    AudioSync = SDL_CreateCond();
-    AudioSyncLock = SDL_CreateMutex();
-
-    AudioFreq = 48000; // TODO: make configurable?
-    SDL_AudioSpec whatIwant, whatIget;
-    memset(&whatIwant, 0, sizeof(SDL_AudioSpec));
-    whatIwant.freq = AudioFreq;
-    whatIwant.format = AUDIO_S16LSB;
-    whatIwant.channels = 2;
-    whatIwant.samples = 1024;
-    whatIwant.callback = AudioCallback;
-    AudioDevice = SDL_OpenAudioDevice(NULL, 0, &whatIwant, &whatIget, SDL_AUDIO_ALLOW_FREQUENCY_CHANGE);
-    if (!AudioDevice)
-    {
-        printf("Audio init failed: %s\n", SDL_GetError());
-    }
-    else
-    {
-        AudioFreq = whatIget.freq;
-        printf("Audio output frequency: %d Hz\n", AudioFreq);
-        SDL_PauseAudioDevice(AudioDevice, 1);
-    }
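SDL_AUDIO_ALLOW_FREQUENCY_CHANGE is why AudioFreq is read back from whatIget: the driver may pick a rate other than 48000 Hz, and the emulator then has to match it. Passing 0 for allowed_changes is the stricter alternative, in which case SDL converts internally and the callback always sees the requested spec (sketch):

    // stricter variant: SDL resamples behind the scenes instead of
    // handing the callback a different frequency
    AudioDevice = SDL_OpenAudioDevice(NULL, 0, &whatIwant, &whatIget, 0);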
-
-    memset(&whatIwant, 0, sizeof(SDL_AudioSpec));
-    whatIwant.freq = 44100;
-    whatIwant.format = AUDIO_S16LSB;
-    whatIwant.channels = 1;
-    whatIwant.samples = 1024;
-    whatIwant.callback = MicCallback;
-    MicDevice = SDL_OpenAudioDevice(NULL, 1, &whatIwant, &whatIget, 0);
-    if (!MicDevice)
-    {
-        printf("Mic init failed: %s\n", SDL_GetError());
-        MicBufferLength = 0;
-    }
-    else
-    {
-        SDL_PauseAudioDevice(MicDevice, 1);
-    }
-
-    memset(MicBuffer, 0, sizeof(MicBuffer));
-    MicBufferReadPos = 0;
-    MicBufferWritePos = 0;
-
-    MicWavBuffer = NULL;
-    if (Config::MicInputType == 3) MicLoadWav(Config::MicWavPath);
-
-    JoystickID = Config::JoystickID;
-    Joystick = NULL;
-    OpenJoystick();
-
-    EmuRunning = 2;
-    RunningSomething = false;
-    EmuThread = SDL_CreateThread(EmuThreadFunc, "melonDS magic", NULL);
-
-    if (argc > 1)
-    {
-        char* file = argv[1];
-        char* ext = &file[strlen(file)-3];
-
-        if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl"))
-        {
-            strncpy(ROMPath[0], file, 1023);
-            ROMPath[0][1023] = '\0';
-
-            SetupSRAMPath(0);
-
-            if (NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot))
-                Run();
-        }
-
-        if (argc > 2)
-        {
-            file = argv[2];
-            ext = &file[strlen(file)-3];
-
-            if (!strcasecmp(ext, "gba"))
-            {
-                strncpy(ROMPath[1], file, 1023);
-                ROMPath[1][1023] = '\0';
-
-                SetupSRAMPath(1);
-
-                NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]);
-            }
-        }
-    }
-
-    uiMain();
-
-    if (Joystick) SDL_JoystickClose(Joystick);
-    if (AudioDevice) SDL_CloseAudioDevice(AudioDevice);
-    if (MicDevice) SDL_CloseAudioDevice(MicDevice);
-
-    SDL_DestroyCond(AudioSync);
-    SDL_DestroyMutex(AudioSyncLock);
-
-    if (MicWavBuffer) delete[] MicWavBuffer;
-
-#ifdef MELONCAP
-    MelonCap::DeInit();
-#endif // MELONCAP
-
-    if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]);
-    if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]);
-
-    Config::ScreenRotation = ScreenRotation;
-    Config::ScreenGap = ScreenGap;
-    Config::ScreenLayout = ScreenLayout;
-    Config::ScreenSizing = ScreenSizing;
-
-    Config::Save();
-
-    uiUninit();
-    SDL_Quit();
-    delete[] EmuDirectory;
-    return 0;
-}
-
-#ifdef __WIN32__
-
-#include <windows.h>
-
-int CALLBACK WinMain(HINSTANCE hinst, HINSTANCE hprev, LPSTR cmdline, int cmdshow)
-{
-    int argc = 0;
-    wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc);
-    char* nullarg = "";
-
-    char** argv = new char*[argc];
-    for (int i = 0; i < argc; i++)
-    {
-        int len = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, NULL, 0, NULL, NULL);
-        if (len < 1) return NULL;
-        argv[i] = new char[len];
-        int res = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, argv[i], len, NULL, NULL);
-        if (res != len) { delete[] argv[i]; argv[i] = nullarg; }
-    }
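Two blemishes in the conversion loop above: return NULL from an int-returning function (it compiles only because NULL is 0), and the argv_w array from CommandLineToArgvW is never released. A tightened sketch of the same loop; LocalFree is the documented way to free that array:

    char** argv = new char*[argc];
    for (int i = 0; i < argc; i++)
    {
        int len = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, NULL, 0, NULL, NULL);
        if (len < 1) return 1;  // fail with a real exit code, not NULL
        argv[i] = new char[len];
        int res = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, argv[i], len, NULL, NULL);
        if (res != len) { delete[] argv[i]; argv[i] = nullarg; }
    }
    LocalFree(argv_w);  // CommandLineToArgvW allocates with LocalAlloc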
-
-    if (AttachConsole(ATTACH_PARENT_PROCESS))
-    {
-        freopen("CONOUT$", "w", stdout);
-        freopen("CONOUT$", "w", stderr);
-        printf("\n");
-    }
-
-    int ret = main(argc, argv);
-
-    printf("\n\n>");
-
-    for (int i = 0; i < argc; i++) if (argv[i] != nullarg) delete[] argv[i];
-    delete[] argv;
-
-    return ret;
-}
-
-#endif
-- cgit v1.2.3