From 9d76d63af5d496e232018d6ddf8ee1e55ad440ad Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 14 Jul 2019 04:33:36 +0200
Subject: jit: make everything configurable

---
 src/Config.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Config.h b/src/Config.h
index 84fd57b..18a7910 100644
--- a/src/Config.h
+++ b/src/Config.h
@@ -46,6 +46,9 @@ extern int Threaded3D;
 extern int GL_ScaleFactor;
 extern int GL_Antialias;
 
+extern bool JIT_Enable;
+extern int JIT_MaxBlockSize;
+
 }
 
 #endif // CONFIG_H
-- 
cgit v1.2.3

From 411fb57c07c732a2b60e3566ae045f8f60eea29d Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Sun, 14 Jul 2019 19:24:00 +0200
Subject: jit: add compile option

---
 CMakeLists.txt                     | 30 +++++++++++++++++++
 src/ARM.cpp                        | 13 ++++----
 src/ARM.h                          |  6 ++++
 src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++-----------------
 src/ARMJIT_x64/ARMJIT_Compiler.h   |  1 -
 src/CMakeLists.txt                 | 25 +++++++++-------
 src/CP15.cpp                       | 12 ++++++--
 src/Config.cpp                     |  4 +++
 src/Config.h                       |  2 ++
 src/NDS.cpp                        | 26 ++++++++++++++++
 src/dolphin/CodeBlock.h            |  3 --
 src/libui_sdl/DlgEmuSettings.cpp   | 21 +++++++++++--
 src/libui_sdl/main.cpp             |  2 ++
 13 files changed, 151 insertions(+), 55 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 048dd44..d59e19c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,36 @@ if(NOT CMAKE_BUILD_TYPE)
 	set(CMAKE_BUILD_TYPE Release)
 endif()
 
+include(CheckSymbolExists)
+function(detect_architecture symbol arch)
+    if (NOT DEFINED ARCHITECTURE)
+        set(CMAKE_REQUIRED_QUIET 1)
+        check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch})
+        unset(CMAKE_REQUIRED_QUIET)
+
+        # The output variable needs to be unique across invocations otherwise
+        # CMake's crazy scope rules will keep it defined
+        if (ARCHITECTURE_${arch})
+            set(ARCHITECTURE "${arch}" PARENT_SCOPE)
+            set(ARCHITECTURE_${arch} 1 PARENT_SCOPE)
+            add_definitions(-DARCHITECTURE_${arch}=1)
+        endif()
+    endif()
+endfunction()
+
+detect_architecture("__x86_64__" x86_64)
+detect_architecture("__i386__" x86)
+detect_architecture("__arm__" ARM)
+detect_architecture("__aarch64__" ARM64)
+
+if (ARCHITECTURE STREQUAL x86_64)
+    option(ENABLE_JIT "Enable x64 JIT recompiler" ON)
+endif()
+
+if (ENABLE_JIT)
+    add_definitions(-DJIT_ENABLED)
+endif()
+
 if (CMAKE_BUILD_TYPE STREQUAL Release)
     option(ENABLE_LTO "Enable link-time optimization" ON)
 else()
diff --git a/src/ARM.cpp b/src/ARM.cpp
index 6cc80c0..eb58d02 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -80,15 +80,8 @@ ARMv4::ARMv4() : ARM(1)
 //
 }
 
-namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];}
-
 void ARM::Reset()
 {
-    FILE* blabla = fopen("fhhg", "w");
-    for (int i = 0; i < ARMInstrInfo::ak_Count; i++)
-        fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]);
-    fclose(blabla);
-
     Cycles = 0;
     Halted = 0;
 
@@ -548,6 +541,7 @@ void ARMv5::Execute()
         Halted = 0;
 }
 
+#ifdef JIT_ENABLED
 void ARMv5::ExecuteJIT()
 {
     if (Halted)
@@ -599,6 +593,7 @@ void ARMv5::ExecuteJIT()
     if (Halted == 2)
         Halted = 0;
 }
+#endif
 
 void ARMv4::Execute()
 {
@@ -677,6 +672,7 @@ void ARMv4::Execute()
         Halted = 0;
 }
 
+#ifdef JIT_ENABLED
 void ARMv4::ExecuteJIT()
 {
     if (Halted)
@@ -728,4 +724,5 @@ void ARMv4::ExecuteJIT()
 
     if (Halted == 2)
         Halted = 0;
-}
\ No newline at end of file
+}
+#endif
\ No newline at end of file
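The ARM.cpp hunks above show the pattern this patch repeats everywhere: the recompiler entry points only exist when CMake defines JIT_ENABLED, and every call site keeps an interpreter fallback behind the same guard. A minimal stand-alone sketch of that shape follows; Cpu and RunCpu are invented names, not melonDS code.

#include <cstdio>

// Sketch of the JIT_ENABLED gate used throughout the patch.
struct Cpu
{
    void Execute()    { printf("interpreter\n"); } // always built
#ifdef JIT_ENABLED
    void ExecuteJIT() { printf("jit\n"); }         // built only when enabled
#endif
};

void RunCpu(Cpu& cpu, bool jitConfigured)
{
#ifdef JIT_ENABLED
    if (jitConfigured) // runtime toggle layered on the build-time one
    {
        cpu.ExecuteJIT();
        return;
    }
#endif
    (void)jitConfigured;
    cpu.Execute();
}

int main()
{
    Cpu cpu;
    RunCpu(cpu, true); // prints "jit" if built with -DJIT_ENABLED, else "interpreter"
    return 0;
}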
diff --git a/src/ARM.h b/src/ARM.h
index 0544301..ecdf5b4 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -52,7 +52,9 @@ public:
     }
 
     virtual void Execute() = 0;
+#ifdef JIT_ENABLED
     virtual void ExecuteJIT() = 0;
+#endif
 
     bool CheckCondition(u32 code)
     {
@@ -152,7 +154,9 @@ public:
     void DataAbort();
 
     void Execute();
+#ifdef JIT_ENABLED
     void ExecuteJIT();
+#endif
 
     // all code accesses are forced nonseq 32bit
     u32 CodeRead32(u32 addr, bool branch);
@@ -271,7 +275,9 @@ public:
     void JumpTo(u32 addr, bool restorecpsr = false);
 
     void Execute();
+#ifdef JIT_ENABLED
     void ExecuteJIT();
+#endif
 
     u16 CodeRead16(u32 addr)
     {
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
index fe23859..18cb27e 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
@@ -4,7 +4,10 @@
 #include <assert.h>
 
+#include "../dolphin/CommonFuncs.h"
+
 #ifdef _WIN32
+#include <windows.h>
 #else
 #include <sys/mman.h>
 #include <unistd.h>
@@ -32,8 +35,6 @@ const int RegisterCache<Compiler, X64Reg>::NativeRegsAvailable =
 #endif
 ;
 
-int instructionPopularityARM[ARMInstrInfo::ak_Count];
-
 /*
     We'll repurpose this .bss memory
  */
 u8 CodeMemory[1024 * 1024 * 32];
 
 Compiler::Compiler()
 {
-#ifdef _WIN32
-#else
-    u64 pagesize = sysconf(_SC_PAGE_SIZE);
-#endif
-
-    u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize);
-    u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned;
-
-#ifdef _WIN32
-#else
-    mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
-#endif
-
-    region = pageAligned;
-    region_size = alignedSize;
-    total_region_size = region_size;
+    {
+    #ifdef _WIN32
+        SYSTEM_INFO sysInfo;
+        GetSystemInfo(&sysInfo);
+
+        u64 pageSize = (u64)sysInfo.dwPageSize;
+    #else
+        u64 pageSize = sysconf(_SC_PAGE_SIZE);
+    #endif
+
+        u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize);
+        u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned;
+
+    #ifdef _WIN32
+        DWORD dummy;
+        VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy);
+    #else
+        mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
+    #endif
+
+        region = pageAligned;
+        region_size = alignedSize;
+        total_region_size = region_size;
+    }
 
     ClearCodeSpace();
-    SetCodePtr(pageAligned);
-
-    memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM));
 
     for (int i = 0; i < 3; i++)
     {
         for (int j = 0; j < 2; j++)
@@ -118,7 +123,7 @@ Compiler::Compiler()
         SetJumpTarget(und);
         MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)));
         RET();
-    }
+    }
     {
         // RSCRATCH mode
         // ABI_PARAM2 reg n
@@ -163,7 +168,10 @@ Compiler::Compiler()
         RET();
     }
 
-    ResetStart = (void*)GetWritableCodePtr();
+    // move the region forward to prevent overwriting the generated functions
+    region_size -= GetWritableCodePtr() - region;
+    total_region_size = region_size;
+    region = GetWritableCodePtr();
 }
 
 void Compiler::LoadCPSR()
@@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = {
 
 void Compiler::Reset()
 {
-    SetCodePtr((u8*)ResetStart);
+    ClearCodeSpace();
 }
 
 CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount)
@@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs
         ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9401220..10428aa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,19 +30,22 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index f232bec..e6e91c3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -812,7 +812,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -834,7 +836,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -856,8 +860,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -879,8 +885,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 37b701c..3cff0ed 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -34,8 +34,10 @@ int Threaded3D; int GL_ScaleFactor; int GL_Antialias; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -45,8 +47,10 @@ ConfigEntry ConfigFile[] = {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, NULL, 0}, {"GL_Antialias", 0, &GL_Antialias, 0, NULL, 0}, +#ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 18a7910..c13eae3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -46,8 +46,10 @@ extern int 
Threaded3D;
 extern int GL_ScaleFactor;
 extern int GL_Antialias;
 
+#ifdef JIT_ENABLED
 extern bool JIT_Enable;
 extern int JIT_MaxBlockSize;
+#endif
 
 }
diff --git a/src/NDS.cpp b/src/NDS.cpp
index 4b50d9c..62a52aa 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -162,7 +162,9 @@ bool Init()
     ARM9 = new ARMv5();
     ARM7 = new ARMv4();
 
+#ifdef JIT_ENABLED
     ARMJIT::Init();
+#endif
 
     DMAs[0] = new DMA(0, 0);
     DMAs[1] = new DMA(0, 1);
@@ -194,7 +196,9 @@ void DeInit()
     delete ARM9;
     delete ARM7;
 
+#ifdef JIT_ENABLED
     ARMJIT::DeInit();
+#endif
 
     for (int i = 0; i < 8; i++)
         delete DMAs[i];
@@ -524,7 +528,9 @@ void Reset()
     KeyCnt = 0;
     RCnt = 0;
 
+#ifdef JIT_ENABLED
     ARMJIT::InvalidateBlockCache();
+#endif
 
     NDSCart::Reset();
     GBACart::Reset();
@@ -741,10 +747,12 @@ bool DoSavestate(Savestate* file)
         GPU::SetPowerCnt(PowerControl9);
     }
 
+#ifdef JIT_ENABLED
     if (!file->Saving)
     {
         ARMJIT::InvalidateBlockCache();
     }
+#endif
 
     return true;
 }
@@ -864,9 +872,11 @@
         }
         else
         {
+#ifdef JIT_ENABLED
             if (EnableJIT)
                 ARM9->ExecuteJIT();
             else
+#endif
                 ARM9->Execute();
         }
 
@@ -889,9 +899,11 @@
         }
         else
         {
+#ifdef JIT_ENABLED
             if (EnableJIT)
                 ARM7->ExecuteJIT();
             else
+#endif
                 ARM7->Execute();
         }
 
@@ -924,9 +936,11 @@
 
 u32 RunFrame()
 {
+#ifdef JIT_ENABLED
     if (Config::JIT_Enable)
         return RunFrame<true>();
     else
+#endif
        return RunFrame<false>();
 }
diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h
index 31a8d93..e71cf6d 100644
--- a/src/dolphin/CodeBlock.h
+++ b/src/dolphin/CodeBlock.h
@@ -9,7 +9,6 @@
 
 #include "Assert.h"
 #include "../types.h"
-#include "MemoryUtil.h"
 
 namespace Common
 {
@@ -41,8 +40,6 @@ public:
   CodeBlock() = default;
   virtual ~CodeBlock()
   {
-    if (region)
-      FreeCodeSpace();
   }
   CodeBlock(const CodeBlock&) = delete;
   CodeBlock& operator=(const CodeBlock&) = delete;
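The NDS.cpp hunks above rely on RunFrame being a template over a bool EnableJIT parameter (already present before this patch), so the JIT check is resolved at compile time and the plain RunFrame() wrapper merely picks an instantiation. A minimal sketch of that dispatch pattern, with simplified stand-in bodies rather than the actual melonDS frame loop:

#include <cstdio>

// Sketch of the template dispatch used by NDS::RunFrame: the EnableJIT
// test is hoisted to compile time, so each instantiation's hot loop
// contains no per-iteration JIT check.
template <bool EnableJIT>
static int RunFrameImpl()
{
    int cycles = 0;
    for (int i = 0; i < 3; i++)
    {
        if (EnableJIT)   // constant-folded per instantiation
            cycles += 2; // stand-in for ExecuteJIT()
        else
            cycles += 1; // stand-in for Execute()
    }
    return cycles;
}

static bool JitEnabledConfig = true; // stand-in for Config::JIT_Enable

int RunFrame()
{
    // runtime config selects an instantiation exactly once per frame
    return JitEnabledConfig ? RunFrameImpl<true>() : RunFrameImpl<false>();
}

int main()
{
    printf("%d\n", RunFrame()); // prints 6
    return 0;
}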
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
index 116d2da..46f5f9f 100644
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ b/src/libui_sdl/DlgEmuSettings.cpp
@@ -38,8 +38,10 @@ uiWindow* win;
 
 uiCheckbox* cbDirectBoot;
 
+#ifdef JIT_ENABLED
 uiCheckbox* cbJITEnabled;
 uiEntry* enJITMaxBlockSize;
+#endif
 
 int OnCloseWindow(uiWindow* window, void* blarg)
 {
@@ -57,13 +59,17 @@ void OnOk(uiButton* btn, void* blarg)
 {
     Config::DirectBoot = uiCheckboxChecked(cbDirectBoot);
 
+#ifdef JIT_ENABLED
     Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled);
-    long blockSize = strtol(uiEntryText(enJITMaxBlockSize), NULL, 10);
+    char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize);
+    long blockSize = strtol(maxBlockSizeStr, NULL, 10);
+    uiFreeText(maxBlockSizeStr);
     if (blockSize < 1)
         blockSize = 1;
     if (blockSize > 32)
         blockSize = 32;
     Config::JIT_MaxBlockSize = blockSize;
+#endif
 
     Config::Save();
 
@@ -73,6 +79,7 @@ void OnOk(uiButton* btn, void* blarg)
     ApplyNewSettings(4);
 }
 
+#ifdef JIT_ENABLED
 void OnJITStateChanged(uiCheckbox* cb, void* blarg)
 {
     if (uiCheckboxChecked(cb))
@@ -80,6 +87,7 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg)
     else
         uiControlDisable(uiControl(enJITMaxBlockSize));
 }
+#endif
 
 void Open()
 {
@@ -90,7 +98,7 @@ void Open()
     }
     opened = true;
 
-    win = uiNewWindow("Emu settings - melonDS", 300, 170, 0, 0, 0);
+    win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0);
     uiWindowSetMargined(win, 1);
     uiWindowOnClosing(win, OnCloseWindow, NULL);
 
@@ -105,6 +113,7 @@ void Open()
         uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0);
     }
 
+#ifdef JIT_ENABLED
     {
         uiLabel* dummy = uiNewLabel("");
         uiBoxAppend(top, uiControl(dummy), 0);
@@ -133,6 +142,12 @@ void Open()
             uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0);
         }
     }
+#endif
+
+    {
+        uiLabel* dummy = uiNewLabel("");
+        uiBoxAppend(top, uiControl(dummy), 0);
+    }
 
     {
         uiBox* in_ctrl = uiNewHorizontalBox();
@@ -153,6 +168,7 @@ void Open()
 
     uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot);
 
+#ifdef JIT_ENABLED
     uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable);
     {
         char maxBlockSizeStr[10];
@@ -160,6 +176,7 @@ void Open()
         uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr);
     }
     OnJITStateChanged(cbJITEnabled, NULL);
+#endif
 
     uiControlShow(uiControl(win));
 }
diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp
index d6809c3..af05d7a 100644
--- a/src/libui_sdl/main.cpp
+++ b/src/libui_sdl/main.cpp
@@ -2411,8 +2411,10 @@ void ApplyNewSettings(int type)
     }
     else if (type == 4)
     {
+#ifdef JIT_ENABLED
         if (Config::JIT_Enable)
             ARMJIT::InvalidateBlockCache();
+#endif
     }
 
     EmuRunning = prevstatus;
-- 
cgit v1.2.3

From a687be9879e5cab4ea5d8646c8cf47c214b18856 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Thu, 3 Oct 2019 01:10:59 +0200
Subject: new block cache and much more...

- more reliable code invalidation detection
- blocks aren't stopped at every branch; instead, branches are followed where possible to build larger blocks
- idle loop recognition
- optimised literal loads, load/store cycle counting, and loads/stores from constant addresses

---
 src/ARM.cpp                         |  44 ++-
 src/ARM.h                           |  16 +-
 src/ARMInterpreter.h                |   9 +
 src/ARMJIT.cpp                      | 755 ++++++++++++++++++++++++++++++------
 src/ARMJIT.h                        | 141 ++-----
 src/ARMJIT_Internal.h               | 198 ++++++++++
 src/ARMJIT_RegisterCache.h          |  36 +-
 src/ARMJIT_x64/ARMJIT_ALU.cpp       |  16 +-
 src/ARMJIT_x64/ARMJIT_Branch.cpp    |  43 +-
 src/ARMJIT_x64/ARMJIT_Compiler.cpp  | 184 +++++++--
 src/ARMJIT_x64/ARMJIT_Compiler.h    |  51 ++-
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++----------------
 src/ARM_InstrInfo.cpp               |  47 ++-
 src/ARM_InstrInfo.h                 |  11 +-
 src/CP15.cpp                        |  12 +-
 src/Config.cpp                      |   2 +
 src/Config.h                        |   1 +
 src/NDS.cpp                         |  22 +-
 src/libui_sdl/DlgEmuSettings.cpp    |  22 +-
 19 files changed, 1550 insertions(+), 689 deletions(-)
 create mode 100644 src/ARMJIT_Internal.h

diff --git a/src/ARM.cpp b/src/ARM.cpp
index e404943..423c940 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -580,21 +580,26 @@ void ARMv5::ExecuteJIT()
             return;
         }
 
-        ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr);
-        Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -710,23 +715,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -736,6 +746,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -758,6 +770,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 4d387bc..8a01068 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -299,7 +299,7 @@ public: { *val = NDS::ARM7Read8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -308,7 +308,7 @@ public: *val = NDS::ARM7Read16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -317,7 +317,7 @@ public: *val = NDS::ARM7Read32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -325,14 +325,14 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -341,7 +341,7 @@ public: NDS::ARM7Write16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -350,7 +350,7 @@ public: NDS::ARM7Write32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -358,7 +358,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h 
b/src/ARMInterpreter.h
index 7244238..2bf8167 100644
--- a/src/ARMInterpreter.h
+++ b/src/ARMInterpreter.h
@@ -28,6 +28,15 @@ namespace ARMInterpreter
 extern void (*ARMInstrTable[4096])(ARM* cpu);
 extern void (*THUMBInstrTable[1024])(ARM* cpu);
 
+void A_MSR_IMM(ARM* cpu);
+void A_MSR_REG(ARM* cpu);
+void A_MRS(ARM* cpu);
+void A_MCR(ARM* cpu);
+void A_MRC(ARM* cpu);
+void A_SVC(ARM* cpu);
+
+void T_SVC(ARM* cpu);
+
 void A_BLX_IMM(ARM* cpu); // I'm a special one look at me
 
 }
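Exporting the individual handlers above lets the recompiler fall back to the interpreter one instruction at a time: ARMJIT.cpp (next diff) builds InterpretARM[]/InterpretTHUMB[] tables from them, indexed by the decoded instruction kind, and executes an uncompilable instruction with a single indirect call. A self-contained sketch of that dispatch shape, where every type and handler below is a stand-in rather than melonDS code:

#include <cstdio>

// Sketch: a table of interpreter entry points indexed by decoded
// instruction kind, as ARMJIT.cpp sets up from ARMInterpreter's handlers.
struct ARM { int r0; };

typedef void (*InterpreterFunc)(ARM* cpu);

static void A_MOV(ARM* cpu) { cpu->r0 = 1; }
static void A_ADD(ARM* cpu) { cpu->r0 += 2; }

static InterpreterFunc InterpretARM[] = { A_MOV, A_ADD };

int main()
{
    ARM cpu = {};
    InterpretARM[0](&cpu); // dispatch by instruction kind, not raw opcode
    InterpretARM[1](&cpu);
    printf("%d\n", cpu.r0); // prints 3
    return 0;
}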
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 85cadf3..686bdd6 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -1,122 +1,137 @@
 #include "ARMJIT.h"
 
 #include <string.h>
+#include <assert.h>
 
 #include "Config.h"
 
+#include "ARMJIT_Internal.h"
 #include "ARMJIT_x64/ARMJIT_Compiler.h"
 
+#include "ARMInterpreter_ALU.h"
+#include "ARMInterpreter_LoadStore.h"
+#include "ARMInterpreter_Branch.h"
+#include "ARMInterpreter.h"
+
+#include "GPU3D.h"
+#include "SPU.h"
+#include "Wifi.h"
+
 namespace ARMJIT
 {
 
+#define JIT_DEBUGPRINT(msg, ...)
+
 Compiler* compiler;
-BlockCache cache;
 
+const u32 ExeMemRegionSizes[] = {
+    0x8000,      // Unmapped Region (dummy)
+    0x8000,      // ITCM
+    4*1024*1024, // Main RAM
+    0x8000,      // SWRAM
+    0xA4000,     // LCDC
+    0x8000,      // ARM9 BIOS
+    0x4000,      // ARM7 BIOS
+    0x10000,     // ARM7 WRAM
+    0x40000      // ARM7 WVRAM
+};
 
-static ptrdiff_t JIT_MEM[2][32] = {
-    //arm9
-    {
-        /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)),
-        /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror
-        /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)),
-        /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)),
-        /* 4X*/ DUP2(-1),
-        /* 5X*/ DUP2(-1),
-        /* 6X*/ -1,
-                offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
-        /* 7X*/ DUP2(-1),
-        /* 8X*/ DUP2(-1),
-        /* 9X*/ DUP2(-1),
-        /* AX*/ DUP2(-1),
-        /* BX*/ DUP2(-1),
-        /* CX*/ DUP2(-1),
-        /* DX*/ DUP2(-1),
-        /* EX*/ DUP2(-1),
-        /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS))
-    },
-    //arm7
-    {
-        /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)),
-        /* 1X*/ DUP2(-1),
-        /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)),
-        /* 3X*/ offsetof(BlockCache, SWRAM),
-                offsetof(BlockCache, ARM7_WRAM),
-        /* 4X*/ DUP2(-1),
-        /* 5X*/ DUP2(-1),
-        /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself,
-                                                           DeSmuME doesn't mirror the 64 MB region at 0x6800000 */
-        /* 7X*/ DUP2(-1),
-        /* 8X*/ DUP2(-1),
-        /* 9X*/ DUP2(-1),
-        /* AX*/ DUP2(-1),
-        /* BX*/ DUP2(-1),
-        /* CX*/ DUP2(-1),
-        /* DX*/ DUP2(-1),
-        /* EX*/ DUP2(-1),
-        /* FX*/ DUP2(-1)
-    }
-};
+const u32 ExeMemRegionOffsets[] = {
+    0,
+    0x8000,
+    0x10000,
+    0x410000,
+    0x418000,
+    0x4BC000,
+    0x4C4000,
+    0x4C8000,
+    0x4D8000,
+    0x518000,
+};
 
-static u32 JIT_MASK[2][32] = {
-    //arm9
-    {
-        /* 0X*/ DUP2(0x00007FFF),
-        /* 1X*/ DUP2(0x00007FFF),
-        /* 2X*/ DUP2(0x003FFFFF),
-        /* 3X*/ DUP2(0x00007FFF),
-        /* 4X*/ DUP2(0x00000000),
-        /* 5X*/ DUP2(0x00000000),
-        /* 6X*/ 0x00000000,
-                0x000FFFFF,
-        /* 7X*/ DUP2(0x00000000),
-        /* 8X*/ DUP2(0x00000000),
-        /* 9X*/ DUP2(0x00000000),
-        /* AX*/ DUP2(0x00000000),
-        /* BX*/ DUP2(0x00000000),
-        /* CX*/ DUP2(0x00000000),
-        /* DX*/ DUP2(0x00000000),
-        /* EX*/ DUP2(0x00000000),
-        /* FX*/ DUP2(0x00007FFF)
-    },
-    //arm7
-    {
-        /* 0X*/ DUP2(0x00003FFF),
-        /* 1X*/ DUP2(0x00000000),
-        /* 2X*/ DUP2(0x003FFFFF),
-        /* 3X*/ 0x00007FFF,
-                0x0000FFFF,
-        /* 4X*/ 0x00000000,
-                0x0000FFFF,
-        /* 5X*/ DUP2(0x00000000),
-        /* 6X*/ DUP2(0x0003FFFF),
-        /* 7X*/ DUP2(0x00000000),
-        /* 8X*/ DUP2(0x00000000),
-        /* 9X*/ DUP2(0x00000000),
-        /* AX*/ DUP2(0x00000000),
-        /* BX*/ DUP2(0x00000000),
-        /* CX*/ DUP2(0x00000000),
-        /* DX*/ DUP2(0x00000000),
-        /* EX*/ DUP2(0x00000000),
-        /* FX*/ DUP2(0x00000000)
-    }
-};
+#define DUP2(x) x, x
+
+const static ExeMemKind JIT_MEM[2][32] = {
+    //arm9
+    {
+        /* 0X*/ DUP2(exeMem_ITCM),
+        /* 1X*/ DUP2(exeMem_ITCM), // mirror
+        /* 2X*/ DUP2(exeMem_MainRAM),
+        /* 3X*/ DUP2(exeMem_SWRAM),
+        /* 4X*/ DUP2(exeMem_Unmapped),
+        /* 5X*/ DUP2(exeMem_Unmapped),
+        /* 6X*/ exeMem_Unmapped,
+                exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB)
+        /* 7X*/ DUP2(exeMem_Unmapped),
+        /* 8X*/ DUP2(exeMem_Unmapped),
+        /* 9X*/ DUP2(exeMem_Unmapped),
+        /* AX*/ DUP2(exeMem_Unmapped),
+        /* BX*/ DUP2(exeMem_Unmapped),
+        /* CX*/ DUP2(exeMem_Unmapped),
+        /* DX*/ DUP2(exeMem_Unmapped),
+        /* EX*/ DUP2(exeMem_Unmapped),
+        /* FX*/ DUP2(exeMem_ARM9_BIOS)
+    },
+    //arm7
+    {
+        /* 0X*/ DUP2(exeMem_ARM7_BIOS),
+        /* 1X*/ DUP2(exeMem_Unmapped),
+        /* 2X*/ DUP2(exeMem_MainRAM),
+        /* 3X*/ exeMem_SWRAM,
+                exeMem_ARM7_WRAM,
+        /* 4X*/ DUP2(exeMem_Unmapped),
+        /* 5X*/ DUP2(exeMem_Unmapped),
+        /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself,
+                                            DeSmuME doesn't mirror the 64 MB region at 0x6800000 */
+        /* 7X*/ DUP2(exeMem_Unmapped),
+        /* 8X*/ DUP2(exeMem_Unmapped),
+        /* 9X*/ DUP2(exeMem_Unmapped),
+        /* AX*/ DUP2(exeMem_Unmapped),
+        /* BX*/ DUP2(exeMem_Unmapped),
+        /* CX*/ DUP2(exeMem_Unmapped),
+        /* DX*/ DUP2(exeMem_Unmapped),
+        /* EX*/ DUP2(exeMem_Unmapped),
+        /* FX*/ DUP2(exeMem_Unmapped)
+    }
 };
 
 #undef DUP2
 
+/*
+    translates address to pseudo physical address
+    - more compact, eliminates mirroring, everything comes in a row
+    - we only need one translation table
+*/
+u32 AddrTranslate9[0x2000];
+u32 AddrTranslate7[0x4000];
+
+JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
+AddressRange CodeRanges[ExeMemSpaceSize / 256];
+
+TinyVector<JitBlock*> JitBlocks;
+JitBlock* RestoreCandidates[0x1000] = {NULL};
+
+u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
+{
+    return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
+}
+
 void Init()
 {
-    memset(&cache, 0, sizeof(BlockCache));
-
     for (int i = 0; i < 0x2000; i++)
-        cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL :
-            (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8])
-            + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1);
+    {
+        ExeMemKind kind = JIT_MEM[0][i >> 8];
+        u32 size = ExeMemRegionSizes[kind];
+
+        AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1));
+    }
     for (int i = 0; i < 0x4000; i++)
-        cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? 
NULL :
-            (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9])
-            + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1);
+    {
+        ExeMemKind kind = JIT_MEM[1][i >> 9];
+        u32 size = ExeMemRegionSizes[kind];
+
+        AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1));
+    }
 
     compiler = new Compiler();
 }
@@ -126,7 +141,7 @@ void DeInit()
     delete compiler;
 }
 
-void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
+void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
 {
     for (int j = start; j >= 0; j--)
     {
@@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
     }
 }
 
+bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr)
+{
+    if (thumb)
+    {
+        u32 r15 = instr.Addr + 4;
+        cond = 0xE;
+
+        if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12)))
+        {
+            targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9);
+            targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1;
+            return true;
+        }
+        else if (instr.Info.Kind == ARMInstrInfo::tk_B)
+        {
+            s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20;
+            targetAddr = r15 + offset;
+            return true;
+        }
+        else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND)
+        {
+            cond = (instr.Instr >> 8) & 0xF;
+            s32 offset = (s32)(instr.Instr << 24) >> 23;
+            targetAddr = r15 + offset;
+            return true;
+        }
+    }
+    else
+    {
+        cond = instr.Cond();
+        if (instr.Info.Kind == ARMInstrInfo::ak_BL
+            || instr.Info.Kind == ARMInstrInfo::ak_B)
+        {
+            s32 offset = (s32)(instr.Instr << 8) >> 6;
+            u32 r15 = instr.Addr + 8;
+            targetAddr = r15 + offset;
+            return true;
+        }
+    }
+    return false;
+}
+
+bool IsIdleLoop(FetchedInstr* instrs, int instrsCount)
+{
+    // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678
+    // it basically checks whether one iteration of a loop depends on another;
+    // the rules are quite simple
+
+    u16 regsWrittenTo = 0;
+    u16 regsDisallowedToWrite = 0;
+    for (int i = 0; i < instrsCount; i++)
+    {
+        //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite);
+        if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem)
+            return false;
+        if (i < instrsCount - 1 && instrs[i].Info.Branches())
+            return false;
+
+        u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15);
+        u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15);
+
+        regsDisallowedToWrite |= srcRegs & ~regsWrittenTo;
+
+        if (dstRegs & regsDisallowedToWrite)
+            return false;
+        regsWrittenTo |= dstRegs;
+    }
+    return true;
+}
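IsIdleLoop() above applies the Dolphin-derived rule: a candidate loop is idle only if it never writes memory, only its last instruction branches, and no register it reads before writing could be changed by a later write within the same iteration. A compact stand-alone restatement of just the register-dependency part follows; the Instr record and the example masks are invented, and the r15 masking and branch checks of the real function are folded into the lead-in assumptions:

#include <cstdio>
#include <cstdint>

// Restatement of the idle-loop dependency check: track which registers
// the loop body has written; any register read before the loop writes it
// must stay constant, so a later write to it makes the loop non-idle.
struct Instr
{
    uint16_t srcRegs;   // bitmask of registers read
    uint16_t dstRegs;   // bitmask of registers written
    bool     writesMem; // stores always disqualify the loop
};

static bool IsIdleLoopSketch(const Instr* instrs, int count)
{
    uint16_t written = 0, disallowed = 0;
    for (int i = 0; i < count; i++)
    {
        if (instrs[i].writesMem)
            return false;
        disallowed |= instrs[i].srcRegs & ~written;
        if (instrs[i].dstRegs & disallowed)
            return false;
        written |= instrs[i].dstRegs;
    }
    return true;
}

int main()
{
    // ldr r0, [r1]; cmp r0, #0 -- r1 is never written, so the loop is idle
    Instr loop[] = { {1 << 1, 1 << 0, false}, {1 << 0, 0, false} };
    printf("%d\n", IsIdleLoopSketch(loop, 2)); // prints 1
    return 0;
}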
+typedef void (*InterpreterFunc)(ARM* cpu);
+
+#define F(x) &ARMInterpreter::A_##x
+#define F_ALU(name, s) \
+    F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \
+    F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s)
+#define F_MEM_WB(name) \
+    F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \
+    F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM)
+#define F_MEM_HD(name) \
+    F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM)
+InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] =
+{
+    F_ALU(AND,), F_ALU(AND,_S),
+    F_ALU(EOR,), F_ALU(EOR,_S),
+    F_ALU(SUB,), F_ALU(SUB,_S),
+    F_ALU(RSB,), F_ALU(RSB,_S),
+    F_ALU(ADD,), F_ALU(ADD,_S),
+    F_ALU(ADC,), F_ALU(ADC,_S),
+    F_ALU(SBC,), F_ALU(SBC,_S),
+    F_ALU(RSC,), F_ALU(RSC,_S),
+    F_ALU(ORR,), F_ALU(ORR,_S),
+    F_ALU(MOV,), F_ALU(MOV,_S),
+    F_ALU(BIC,), F_ALU(BIC,_S),
+    F_ALU(MVN,), F_ALU(MVN,_S),
+    F_ALU(TST,),
+    F_ALU(TEQ,),
+    F_ALU(CMP,),
+    F_ALU(CMN,),
+
+    F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy),
+    F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB),
+
+    F_MEM_WB(STR),
+    F_MEM_WB(STRB),
+    F_MEM_WB(LDR),
+    F_MEM_WB(LDRB),
+
+    F_MEM_HD(STRH),
+    F_MEM_HD(LDRD),
+    F_MEM_HD(STRD),
+    F_MEM_HD(LDRH),
+    F_MEM_HD(LDRSB),
+    F_MEM_HD(LDRSH),
+
+    F(SWP), F(SWPB),
+    F(LDM), F(STM),
+
+    F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG),
+    F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC)
+};
+#undef F_ALU
+#undef F_MEM_WB
+#undef F_MEM_HD
+#undef F
+
+#define F(x) ARMInterpreter::T_##x
+InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] =
+{
+    F(LSL_IMM), F(LSR_IMM), F(ASR_IMM),
+    F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_),
+    F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM),
+    F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG),
+    F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG),
+    F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG),
+    F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG),
+    F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP),
+    F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG),
+    F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM),
+    F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL),
+    F(PUSH), F(POP), F(LDMIA), F(STMIA),
+    F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2),
+    F(UNK), F(SVC),
+    NULL // BL_LONG pseudo opcode
+};
+#undef F
+
+void CompileBlock(ARM* cpu)
 {
     bool thumb = cpu->CPSR & 0x20;
 
@@ -153,17 +315,41 @@
     if (Config::JIT_MaxBlockSize > 32)
         Config::JIT_MaxBlockSize = 32;
 
+    u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);
+    if (!(cpu->Num == 0
+        ? IsMapped<0>(blockAddr)
+        : IsMapped<1>(blockAddr)))
+    {
+        printf("Trying to compile a block in unmapped memory: %x\n", blockAddr);
+    }
+
+    u32 pseudoPhysicalAddr = cpu->Num == 0
+        ? TranslateAddr<0>(blockAddr)
+        : TranslateAddr<1>(blockAddr);
+
     FetchedInstr instrs[Config::JIT_MaxBlockSize];
     int i = 0;
-    u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);
     u32 r15 = cpu->R[15];
+
+    u32 addresseRanges[32] = {};
+    u32 numAddressRanges = 0;
+
     cpu->FillPipeline();
     u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]};
     u32 nextInstrAddr[2] = {blockAddr, r15};
+
+    JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n",
+        blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2],
+        cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr),
+        CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated);
+
+    u32 lastSegmentStart = blockAddr;
+
     do
     {
         r15 += thumb ? 2 : 4;
 
+        instrs[i].BranchFlags = 0;
         instrs[i].SetFlags = 0;
         instrs[i].Instr = nextInstr[0];
         instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1];
@@ -171,6 +357,25 @@
         instrs[i].Addr = nextInstrAddr[0];
         nextInstrAddr[0] = nextInstrAddr[1];
         nextInstrAddr[1] = r15;
+        JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr);
+
+        u32 translatedAddr = (cpu->Num == 0
+            ? 
TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
256].Blocks.Clear();
+            CodeRanges[addr / 256].TimesInvalidated = 0;
+        }
+        delete block;
+    }
+    JitBlocks.Clear();
 
     compiler->Reset();
 }
 
+void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
+{
+    if (cpu->Num == 0)
+    {
+        if ((addr & 0xFF000000) == 0x04000000)
+        {
+            /*
+                unfortunately we can't map GPU2D this way
+                since it's hidden inside an object
+
+                though GPU3D registers are accessed much more intensively
+            */
+            if (addr >= 0x04000320 && addr < 0x040006A4)
+            {
+                switch (size | store)
+                {
+                case 8: return (void*)GPU3D::Read8;
+                case 9: return (void*)GPU3D::Write8;
+                case 16: return (void*)GPU3D::Read16;
+                case 17: return (void*)GPU3D::Write16;
+                case 32: return (void*)GPU3D::Read32;
+                case 33: return (void*)GPU3D::Write32;
+                }
+            }
+
+            switch (size | store)
+            {
+            case 8: return (void*)NDS::ARM9IORead8;
+            case 9: return (void*)NDS::ARM9IOWrite8;
+            case 16: return (void*)NDS::ARM9IORead16;
+            case 17: return (void*)NDS::ARM9IOWrite16;
+            case 32: return (void*)NDS::ARM9IORead32;
+            case 33: return (void*)NDS::ARM9IOWrite32;
+            }
+        }
+    }
+    else
+    {
+        switch (addr & 0xFF800000)
+        {
+        case 0x04000000:
+            if (addr >= 0x04000400 && addr < 0x04000520)
+            {
+                switch (size | store)
+                {
+                case 8: return (void*)SPU::Read8;
+                case 9: return (void*)SPU::Write8;
+                case 16: return (void*)SPU::Read16;
+                case 17: return (void*)SPU::Write16;
+                case 32: return (void*)SPU::Read32;
+                case 33: return (void*)SPU::Write32;
+                }
+            }
+
+            switch (size | store)
+            {
+            case 8: return (void*)NDS::ARM7IORead8;
+            case 9: return (void*)NDS::ARM7IOWrite8;
+            case 16: return (void*)NDS::ARM7IORead16;
+            case 17: return (void*)NDS::ARM7IOWrite16;
+            case 32: return (void*)NDS::ARM7IORead32;
+            case 33: return (void*)NDS::ARM7IOWrite32;
+            }
+            break;
+        case 0x04800000:
+            if (addr < 0x04810000 && size == 16)
+            {
+                if (store)
+                    return (void*)Wifi::Write;
+                else
+                    return (void*)Wifi::Read;
+            }
+            break;
+        }
+    }
+    return NULL;
+}
+
 }
\ No newline at end of file
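GetFuncForAddr() above lets the compiler emit a direct call to the right I/O handler when a load/store address is a compile-time constant, using size | store as the selector: size is 8/16/32 and bit 0 marks a write, so read/write handler pairs land on 8/9, 16/17 and 32/33. A self-contained sketch of just that selector encoding, with stand-in handler names instead of the real NDS/GPU3D/SPU functions:

#include <cstdio>

// Sketch of the (size | store) selector used by GetFuncForAddr().
static const char* PickIOHandler(int size, bool store)
{
    switch (size | (int)store)
    {
    case 8:  return "Read8";
    case 9:  return "Write8";
    case 16: return "Read16";
    case 17: return "Write16";
    case 32: return "Read32";
    case 33: return "Write32";
    }
    return nullptr; // fall back to the generic memory path
}

int main()
{
    printf("%s\n", PickIOHandler(16, true)); // prints Write16
    return 0;
}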
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 7e448ef..1db4d66 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -9,142 +9,67 @@ namespace ARMJIT
{
-typedef u32 (*CompiledBlock)();
-
-struct FetchedInstr
+enum ExeMemKind
 {
-    u32 A_Reg(int pos) const
-    {
-        return (Instr >> pos) & 0xF;
-    }
-
-    u32 T_Reg(int pos) const
-    {
-        return (Instr >> pos) & 0x7;
-    }
-
-    u32 Cond() const
-    {
-        return Instr >> 28;
-    }
-
-    u8 SetFlags;
-    u32 Instr;
-    u32 NextInstr[2];
-    u32 Addr;
-
-    u8 CodeCycles;
-
-    ARMInstrInfo::Info Info;
+    exeMem_Unmapped = 0,
+    exeMem_ITCM,
+    exeMem_MainRAM,
+    exeMem_SWRAM,
+    exeMem_LCDC,
+    exeMem_ARM9_BIOS,
+    exeMem_ARM7_BIOS,
+    exeMem_ARM7_WRAM,
+    exeMem_ARM7_WVRAM,
+    exeMem_Count
 };
 
-/*
-    Copied from DeSmuME
-    Some names were changed to match the nomenclature of melonDS
+extern const u32 ExeMemRegionOffsets[];
+extern const u32 ExeMemRegionSizes[];
 
-    Since it's nowhere explained and at least I needed some time to get behind it,
-    here's a summary on how it works:
-    more or less all memory locations from which code can be executed are
-    represented by an array of function pointers, which point to null or
-    a function which executes a block of instructions starting from there.
+typedef u32 (*JitBlockEntry)();
 
-    The most significant 4 bits of each address are ignored. This 28 bit space is
-    divided into 0x2000 32 KB regions for the ARM9 and 0x4000 16 KB regions for the ARM7,
-    each of which holds a pointer to the relevant place inside the aforementioned arrays.
-    32 and 16 KB are the sizes of the smallest contiguous memory region mapped to the respective CPU.
-    Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary,
-    we only need every second half word to be addressable.
+extern u32 AddrTranslate9[0x2000];
+extern u32 AddrTranslate7[0x4000];
 
-    In case a memory write hits mapped memory, the function block at this
-    address is set to null, so it's recompiled the next time it's executed.
-
-    This method has disadvantages, namely that only writing to the
-    first instruction of a block marks it as invalid and that memory remapping
-    (SWRAM and VRAM) isn't taken into account.
-*/
-
-struct BlockCache
-{
-    CompiledBlock* AddrMapping9[0x2000] = {0};
-    CompiledBlock* AddrMapping7[0x4000] = {0};
-
-    CompiledBlock MainRAM[4*1024*1024/2];
-    CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
-    CompiledBlock ARM9_ITCM[0x8000/2];
-    CompiledBlock ARM9_LCDC[0xA4000/2];
-    CompiledBlock ARM9_BIOS[0x8000/2];
-    CompiledBlock ARM7_BIOS[0x4000/2];
-    CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM
-    CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM
-};
-
-extern BlockCache cache;
+const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
+extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 
 template <u32 num>
 inline bool IsMapped(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped];
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped];
 }
 
 template <u32 num>
-inline CompiledBlock LookUpBlock(u32 addr)
+inline u32 TranslateAddr(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF);
 }
 
 template <u32 num>
-inline void Invalidate16(u32 addr)
+inline JitBlockEntry LookUpBlock(u32 addr)
 {
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-            cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL;
-        else
-            cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
-    }
-}
-
-template <u32 num>
-inline void Invalidate32(u32 addr)
-{
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-        {
-            CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
-            page[(addr & 0x7FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x7FFF) >> 1] = NULL;
-        }
-        else
-        {
-            CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
-            page[(addr & 0x3FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x3FFF) >> 1] = NULL;
-        }
-    }
-}
-
-template <u32 num>
-inline void InsertBlock(u32 addr, CompiledBlock func)
-{
-    if (num == 0)
-        cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func;
-    else
-        cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
+    return FastBlockAccess[TranslateAddr<num>(addr) / 2];
 }
 
 void Init();
 void DeInit();
 
-CompiledBlock CompileBlock(ARM* cpu);
+void InvalidateByAddr(u32 pseudoPhysical);
+void InvalidateAll();
+
+void InvalidateITCM(u32 addr);
+void InvalidateByAddr7(u32 addr);
+
+void CompileBlock(ARM* cpu);
 
-void InvalidateBlockCache();
+void ResetBlockCache();
 
 }
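The two AddrTranslate tables above collapse the mirrored CPU address spaces into one flat "pseudo physical" space: one u32 per 32 KB granule (16 KB on the ARM7) maps to a region base plus wrapped offset, so mirrors of the same RAM resolve to identical pseudo addresses and a single FastBlockAccess array can index blocks. A self-contained sketch of the same translation, with one invented 4 MB region standing in for the real layout:

#include <cstdio>
#include <cstdint>

// Sketch of the AddrTranslate9 scheme; the region constants are invented.
static const uint32_t RegionOffset = 0x10000;  // where "RAM" lives in the flat space
static const uint32_t RegionSize   = 0x400000; // 4 MB, power of two

static uint32_t translate[0x2000]; // indexed by (addr & 0xFFFFFFF) >> 15

static void InitTranslate()
{
    for (uint32_t i = 0; i < 0x2000; i++)
        translate[i] = RegionOffset + ((i << 15) & (RegionSize - 1));
}

static uint32_t TranslateAddr(uint32_t addr)
{
    return translate[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
}

int main()
{
    InitTranslate();
    // a main-RAM address and one of its mirrors land on the same pseudo address
    printf("%x %x\n", TranslateAddr(0x02000000), TranslateAddr(0x02400000));
    return 0;
}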
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
+class JitBlock
+{
+public:
+    JitBlock(u32 numInstrs, u32 numAddresses)
+    {
+        NumInstrs = numInstrs;
+        NumAddresses = numAddresses;
+        Data = new u32[numInstrs + numAddresses];
+    }
+
+    ~JitBlock()
+    {
+        delete[] Data;
+    }
+
+    u32 StartAddr;
+    u32 PseudoPhysicalAddr;
+
+    u32 NumInstrs;
+    u32 NumAddresses;
+
+    JitBlockEntry EntryPoint;
+
+    u32* Instrs()
+    { return Data; }
+    u32* AddressRanges()
+    { return Data + NumInstrs; }
+
+private:
+    /*
+        0..<NumInstrs - 1>: the instructions of the block
+        NumInstrs..<NumInstrs + NumAddresses - 1>: the pseudo physical address ranges
+    */
+    u32* Data;
+};
+
+struct __attribute__((packed)) AddressRange
+{
+    TinyVector<JitBlock*> Blocks;
+    u16 TimesInvalidated;
+};
+
+extern AddressRange CodeRanges[ExeMemSpaceSize / 256];
+
+typedef void (*InterpreterFunc)(ARM* cpu);
+extern InterpreterFunc InterpretARM[];
+extern InterpreterFunc InterpretTHUMB[];
+
+void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size);
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h
index fe2f203..ed6a2b7 100644
--- a/src/ARMJIT_RegisterCache.h
+++ b/src/ARMJIT_RegisterCache.h
@@ -60,15 +60,46 @@ public:
         assert("Welp!");
     }
 
+    void PutLiteral(int reg, u32 val)
+    {
+        LiteralsLoaded |= (1 << reg);
+        LiteralValues[reg] = val;
+    }
+
+    void UnloadLiteral(int reg)
+    {
+        LiteralsLoaded &= ~(1 << reg);
+    }
+
+    bool IsLiteral(int reg)
+    {
+        return LiteralsLoaded & (1 << reg);
+    }
+
+    void PrepareExit()
+    {
+        BitSet16 dirtyRegs(DirtyRegs);
+        for (int reg : dirtyRegs)
+            Compiler->SaveReg(reg, Mapping[reg]);
+    }
+
     void Flush()
     {
         BitSet16 loadedSet(LoadedRegs);
         for (int reg : loadedSet)
             UnloadRegister(reg);
+        LiteralsLoaded = 0;
     }
 
     void Prepare(bool thumb, int i)
     {
+        if (LoadedRegs & (1 << 15))
+            UnloadRegister(15);
+
+        BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs);
+        for (int reg : invalidedLiterals)
+            UnloadLiteral(reg);
+
         u16 futureNeeded = 0;
         int ranking[16];
         for (int j = 0; j < 16; j++)
@@ -86,7 +117,7 @@ public:
         for (int reg : neverNeededAgain)
             UnloadRegister(reg);
 
-        FetchedInstr Instr = Instrs[i];
+        FetchedInstr Instr = Instrs[i];
         u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs;
         BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs);
         if (needToBeLoaded != BitSet16(0))
@@ -125,6 +156,9 @@ public:
     static const int NativeRegsAvailable;
 
     Reg Mapping[16];
+    u32 LiteralValues[16];
+
+    u16 LiteralsLoaded = 0;
     u32 NativeRegsUsed = 0;
     u16 LoadedRegs = 0;
     u16 DirtyRegs = 0;
diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp
index f868ddf..14c223b 100644
--- a/src/ARMJIT_x64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp
@@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp()
         MOV(32, rd, op2);
 
     if (((CurInstr.Instr >> 21) & 0xF) == 0xF)
+    {
         NOT(32, rd);
+        if (op2.IsImm() && CurInstr.Cond() == 0xE)
+            RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32());
+    }
+    else if (op2.IsImm() && CurInstr.Cond() == 0xE)
+        RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32());
 
     if (S)
     {
@@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_()
 
     Comp_AddCycles_C();
 
-    if (op & 1)
+    // special case for thumb mov being an alias for add rd, rn, #0
+    if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0)
+    {
+        if (rd != rs)
+            MOV(32, rd, rs);
+    }
+    else if (op & 1)
         Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV);
     else
         Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV);
@@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU()
     u32 op = (CurInstr.Instr >> 6) & 0xF;
 
     if ((op >= 0x2 && op < 0x4) || op == 0x7)
-        Comp_AddCycles_CI(1);
+        Comp_AddCycles_CI(1); // shift by reg
     else
         Comp_AddCycles_C();
 
diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index cc7a3c4..0dedb3f 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -16,9 +16,6 @@ int squeezePointer(T* ptr)
 void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
 {
     // we can simplify constant branches by a lot
-    // it's not completely safe to assume stuff like, which instructions to preload
-    // we'll see how it works out
-
     IrregularCycles = true;
 
     u32 newPC;
@@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE;

-        if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
+        if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
         {
             MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15));
-            MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
-            MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));
-
             if (comp == NULL)
+            {
+                MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
+                MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));
+
                 SaveCPSR();
+            }
         }
-
+
         if (comp != NULL)
             RegCache.Prepare(Thumb, i);
         else
@@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
         if (Thumb)
         {
-            u32 icode = (CurInstr.Instr >> 6) & 0x3FF;
             if (comp == NULL)
             {
                 MOV(64, R(ABI_PARAM1), R(RCPU));

-                ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]);
+                ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]);
             }
             else
                 (this->*comp)();
@@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
                 }
             }
             else if (cond == 0xF)
+            {
                 Comp_AddCycles_C();
+            }
             else
             {
                 IrregularCycles = false;
@@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
                 if (cond < 0xE)
                     skipExecute = CheckCondition(cond);

-                u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0);
                 if (comp == NULL)
                 {
                     MOV(64, R(ABI_PARAM1), R(RCPU));

-                    ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]);
+                    ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]);
                 }
                 else
                     (this->*comp)();

+                Comp_SpecialBranchBehaviour();
+
                 if (CurInstr.Cond() < 0xE)
                 {
-                    if (IrregularCycles)
+                    if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken))
                     {
                         FixupBranch skipFailed = J();
                         SetJumpTarget(skipExecute);

                         Comp_AddCycles_C(true);

+                        if (CurInstr.BranchFlags & branch_FollowCondTaken)
+                        {
+                            RegCache.PrepareExit();
+                            SaveCPSR(false);
+
+                            MOV(32, R(RAX), Imm32(ConstantCycles));
+                            ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
+                            RET();
+                        }
+
                         SetJumpTarget(skipFailed);
                     }
                     else
@@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs
     ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8);
     RET();

+    /*FILE* codeout = fopen("codeout", "a");
+    fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr);
+    fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout);
+
+    fclose(codeout);*/
+
     return res;
 }

@@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add)
     }
 }

+void Compiler::Comp_AddCycles_CDI()
+{
+    if (Num == 0)
+        Comp_AddCycles_CD();
+    else
+    {
+        IrregularCycles = true;
+
+        s32 cycles;
+
+        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
+        s32 numD = CurInstr.DataCycles;
+
+        if (CurInstr.DataRegion == 0x02) // mainRAM
+        {
+            if (CodeRegion == 0x02)
+                cycles = numC + numD;
+            else
+            {
+                numC++;
+                cycles = std::max(numC + numD - 3, std::max(numC, numD));
+            }
+        }
+        else if (CodeRegion == 0x02)
+        {
+            numD++;
+            cycles = std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else
+        {
+            cycles = numC + numD + 1;
+        }
+
+        printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles);
+
+        if (!Thumb && CurInstr.Cond() < 0xE)
+            ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
+        else
+            ConstantCycles += cycles;
+    }
+}
+
+void Compiler::Comp_AddCycles_CD()
+{
+    u32 cycles = 0;
+    if (Num == 0)
+    {
+        s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles;
+        s32 numD = CurInstr.DataCycles;
+
+        //if (DataRegion != CodeRegion)
+            cycles = std::max(numC + numD - 6, std::max(numC, numD));
+
+        IrregularCycles = cycles != numC;
+    }
+    else
+    {
+        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
+        s32 numD = CurInstr.DataCycles;
+
+        if (CurInstr.DataRegion == 0x02)
+        {
+            if (CodeRegion == 0x02)
+                cycles += numC + numD;
+            else
+                cycles += std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else if (CodeRegion == 0x02)
+        {
+            cycles += std::max(numC + numD - 3, std::max(numC, numD));
+        }
+        else
+        {
+            cycles += numC + numD;
+        }
+
+        IrregularCycles = true;
+    }
+
+    if (!Thumb && CurInstr.Cond() < 0xE)
+        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
+    else
+        ConstantCycles += cycles;
+}
+
 }
\ No newline at end of file
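(Aside, not part of the patch: the std::max(numC + numD - 3, std::max(numC, numD)) expression above merges code and data cycles that partly overlap on the bus, clamping the total so it never drops below the slower of the two accesses. Worked through with made-up timings numC = 4 and numD = 2: max(4 + 2 - 3, max(4, 2)) = max(3, 4) = 4 cycles, cheaper than the naive 4 + 2 = 6; with numC = numD = 2 it yields max(1, 2) = 2. The ARM9 path in Comp_AddCycles_CD applies the same shape with a larger overlap constant of 6.)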
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index fcb2380..792ff66 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -4,6 +4,7 @@
 #include "../dolphin/x64Emitter.h"

 #include "../ARMJIT.h"
+#include "../ARMJIT_Internal.h"
 #include "../ARMJIT_RegisterCache.h"

 namespace ARMJIT
@@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX;
 const Gen::X64Reg RSCRATCH2 = Gen::EDX;
 const Gen::X64Reg RSCRATCH3 = Gen::ECX;

+struct ComplexOperand
+{
+    ComplexOperand()
+    {}
+
+    ComplexOperand(u32 imm)
+        : IsImm(true), Imm(imm)
+    {}
+    ComplexOperand(int reg, int op, int amount)
+        : IsImm(false)
+    {
+        Reg.Reg = reg;
+        Reg.Op = op;
+        Reg.Amount = amount;
+    }
+
+    bool IsImm;
+    union
+    {
+        struct
+        {
+            int Reg, Op, Amount;
+        } Reg;
+        u32 Imm;
+    };
+};

 class Compiler : public Gen::XEmitter
 {
@@ -24,7 +51,7 @@ public:
     void Reset();

-    CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount);
+    JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount);

     void LoadReg(int reg, Gen::X64Reg nativeReg);
     void SaveReg(int reg, Gen::X64Reg nativeReg);
@@ -39,6 +66,8 @@ public:
     void Comp_AddCycles_C(bool forceNonConstant = false);
     void Comp_AddCycles_CI(u32 i);
     void Comp_AddCycles_CI(Gen::X64Reg i, int add);
+    void Comp_AddCycles_CDI();
+    void Comp_AddCycles_CD();

     enum
     {
@@ -92,8 +121,17 @@ public:
     void T_Comp_BL_LONG_2();
     void T_Comp_BL_Merged();

-    void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size);
+    enum
+    {
+        memop_Writeback = 1 << 0,
+        memop_Post = 1 << 1,
+        memop_SignExtend = 1 << 2,
+        memop_Store = 1 << 3,
+        memop_SubtractOffset = 1 << 4
+    };
+    void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags);
     s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode);
+    void Comp_MemLoadLiteral(int size, int rd, u32 addr);

     void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&,
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..10c3b1b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,9 +561,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -813,7 +815,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -837,7 +839,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -861,8 +863,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -886,8 +887,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 3cff0ed..63d61a3 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,7 @@ int GL_Antialias; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -50,6 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c13eae3..0fcefc3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -49,6 +49,7 @@ extern int GL_Antialias; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 1baa308..e9e6795 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -536,7 +536,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -757,7 +757,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -1870,10 +1870,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1924,10 +1920,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1994,10 +1986,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2292,7 +2280,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2355,7 +2343,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2428,7 +2416,7 @@ void ARM7Write16(u32 
addr, u16 val)

 void ARM7Write32(u32 addr, u32 val)
 {
 #ifdef JIT_ENABLED
-    ARMJIT::Invalidate32<1>(addr);
+    ARMJIT::InvalidateByAddr7(addr);
 #endif

     switch (addr & 0xFF800000)
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
index 09ea8eb..45e8e0c 100644
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ b/src/libui_sdl/DlgEmuSettings.cpp
@@ -42,6 +42,7 @@ uiCheckbox* cbDirectBoot;
 #ifdef JIT_ENABLED
 uiCheckbox* cbJITEnabled;
 uiEntry* enJITMaxBlockSize;
+uiCheckbox* cbJITBranchOptimisations;
 #endif

 int OnCloseWindow(uiWindow* window, void* blarg)
@@ -64,13 +65,15 @@ void OnOk(uiButton* btn, void* blarg)
     bool enableJit = uiCheckboxChecked(cbJITEnabled);
     char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize);
     long blockSize = strtol(maxBlockSizeStr, NULL, 10);
+    bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations);
     uiFreeText(maxBlockSizeStr);

     if (blockSize < 1)
         blockSize = 1;
     if (blockSize > 32)
         blockSize = 32;

-    if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize)
+    if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize ||
+        branchOptimisations != Config::JIT_BrancheOptimisations)
     {
         if (RunningSomething &&
             !uiMsgBoxConfirm(win, "Reset emulator",
@@ -79,6 +82,7 @@ void OnOk(uiButton* btn, void* blarg)
         Config::JIT_Enable = enableJit;
         Config::JIT_MaxBlockSize = blockSize;
+        Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations);

         restart = true;
     }
@@ -101,9 +105,15 @@ void OnOk(uiButton* btn, void* blarg)
 void OnJITStateChanged(uiCheckbox* cb, void* blarg)
 {
     if (uiCheckboxChecked(cb))
+    {
         uiControlEnable(uiControl(enJITMaxBlockSize));
+        uiControlEnable(uiControl(cbJITBranchOptimisations));
+    }
     else
+    {
         uiControlDisable(uiControl(enJITMaxBlockSize));
+        uiControlDisable(uiControl(cbJITBranchOptimisations));
+    }
 }
 #endif

@@ -159,6 +169,14 @@ void Open()
             enJITMaxBlockSize = uiNewEntry();
             uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0);
         }
+
+        {
+            uiBox* row = uiNewHorizontalBox();
+            uiBoxAppend(in_ctrl, uiControl(row), 0);
+
+            cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks games in rare cases!)");
+            uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0);
+        }
     }
 #endif

@@ -194,6 +212,8 @@ void Open()
         uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr);
     }
     OnJITStateChanged(cbJITEnabled, NULL);
+
+    uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations);
 #endif

     uiControlShow(uiControl(win));
-- cgit v1.2.3


From 81f38c14be0d9ba5a3da8f67d9719ed2c47279c5 Mon Sep 17 00:00:00 2001
From: RSDuck
Date: Fri, 18 Oct 2019 13:29:17 +0200
Subject: integrate changes from ARM64 backend and more

- better handle LDM/STM in reg alloc
- unify Halted and IRQ in anticipation for branch inlining
- literal optimisations can be disabled in gui
- jit blocks follow simple returns
- fix idle loop detection
- break jit blocks on IRQ (fixes saving in Pokemon White)
---
 src/ARM.cpp                         | 40 ++++++++++++++++++-----------
 src/ARM.h                           | 13 +++++++---
 src/ARMJIT.cpp                      | 50 +++++++++++++++++++++++++++++++------
 src/ARMJIT_RegisterCache.h          | 33 +++++++++++++++++++-----
 src/ARMJIT_x64/ARMJIT_Compiler.cpp  |  7 +++---
 src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++----
 src/ARM_InstrInfo.cpp               | 28 +++++++++++++++++++++
 src/ARM_InstrInfo.h                 |  2 +-
 src/Config.cpp                      |  2 ++
 src/Config.h                        |  1 +
 src/NDS.cpp                         |  2 +-
 src/libui_sdl/DlgEmuSettings.cpp    | 31 ++++++++++++++++++---
 src/libui_sdl/main.cpp              |  2 --
 13 files changed, 179 insertions(+), 48 deletions(-)

(limited to 'src/Config.h')
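(Aside, not part of the patches: the "unify Halted and IRQ" item above is the union added to src/ARM.h below. Overlaying the three per-CPU stop flags with one u32 lets compiled blocks and the dispatch loop test all of them with a single compare; a minimal sketch using the patch's field names, with the surrounding loop condensed:

    union
    {
        struct
        {
            u8 Halted;
            u8 IRQ;      // nonzero to trigger IRQ
            u8 IdleLoop; // set when a compiled block detects an idle loop
        };
        u32 StopExecution; // nonzero iff any flag above is set
    };

    // dispatch loop: one load instead of three separate flag checks
    if (cpu->StopExecution)
    {
        if (cpu->IRQ)
            cpu->TriggerIRQ();
        // ... handle Halted / IdleLoop, see src/ARM.cpp below
    }

The fourth byte of the u32 is simply left unused.)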
diff --git a/src/ARM.cpp b/src/ARM.cpp
index 423c940..4fab60e 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -113,7 +113,7 @@ void ARM::DoSavestate(Savestate* file)
     file->Var32((u32*)&Cycles);
     //file->Var32((u32*)&CyclesToRun);
-    file->Var32(&Halted);
+    file->Var32(&StopExecution);

     file->VarArray(R, 16*sizeof(u32));
     file->Var32(&CPSR);
@@ -589,16 +589,21 @@ void ARMv5::ExecuteJIT()
         NDS::ARM9Timestamp += Cycles;
         Cycles = 0;

-        if (IRQ) TriggerIRQ();
-        if (Halted)
+        if (StopExecution)
         {
-            bool idleLoop = Halted & 0x20;
-            Halted &= ~0x20;
-            if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
+            if (IRQ)
+                TriggerIRQ();
+
+            if (Halted || IdleLoop)
             {
-                NDS::ARM9Timestamp = NDS::ARM9Target;
+                bool idleLoop = IdleLoop;
+                IdleLoop = 0;
+                if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target)
+                {
+                    NDS::ARM9Timestamp = NDS::ARM9Target;
+                }
+                break;
             }
-            break;
         }
     }

@@ -726,16 +731,21 @@ void ARMv4::ExecuteJIT()
         Cycles = 0;

         // TODO optimize this shit!!!
-        if (IRQ) TriggerIRQ();
-        if (Halted)
+        if (StopExecution)
         {
-            bool idleLoop = Halted & 0x20;
-            Halted &= ~0x20;
-            if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
+            if (IRQ)
+                TriggerIRQ();
+
+            if (Halted || IdleLoop)
             {
-                NDS::ARM7Timestamp = NDS::ARM7Target;
+                bool idleLoop = IdleLoop;
+                IdleLoop = 0;
+                if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target)
+                {
+                    NDS::ARM7Timestamp = NDS::ARM7Target;
+                }
+                break;
             }
-            break;
         }
     }

diff --git a/src/ARM.h b/src/ARM.h
index 8a01068..e252d23 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -112,9 +112,16 @@ public:
     u32 Num;

     s32 Cycles;
-    u32 Halted;
-
-    u32 IRQ; // nonzero to trigger IRQ
+    union
+    {
+        struct
+        {
+            u8 Halted;
+            u8 IRQ; // nonzero to trigger IRQ
+            u8 IdleLoop;
+        };
+        u32 StopExecution;
+    };

     u32 CodeRegion;
     s32 CodeCycles;
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 19a5e70..0695b85 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -16,11 +16,13 @@
 #include "GPU3D.h"
 #include "SPU.h"
 #include "Wifi.h"
+#include "NDSCart.h"

 namespace ARMJIT
 {

 #define JIT_DEBUGPRINT(msg, ...)
+//#define JIT_DEBUGPRINT(msg, ...)
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = 
CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1);
@@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf()
 
 void Compiler::T_Comp_LoadPCRel()
 {
-    u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);
-
-    Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
+    u32 offset = (CurInstr.Instr & 0xFF) << 2;
+    u32 addr = (R15 & ~0x2) + offset;
+    if (Config::JIT_LiteralOptimisations)
+        Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr);
+    else
+        Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0);
 }
 
 void Compiler::T_Comp_MemSPRel()
diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp
index 1261bbe..8f8bd35 100644
--- a/src/ARM_InstrInfo.cpp
+++ b/src/ARM_InstrInfo.cpp
@@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr)
         if (res.Kind == ARMInstrInfo::tk_LDR_PCREL)
             res.SpecialKind = special_LoadLiteral;
 
+        if (res.Kind == tk_LDMIA || res.Kind == tk_POP)
+        {
+            u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs);
+            res.NotStrictlyNeeded |= set;
+            res.DstRegs |= set;
+        }
+        if (res.Kind == tk_STMIA || res.Kind == tk_PUSH)
+        {
+            u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs);
+            if (res.Kind == tk_PUSH && instr & (1 << 8))
+                set |= (1 << 14);
+            res.NotStrictlyNeeded |= set;
+            res.SrcRegs |= set;
+        }
+
         res.EndBlock |= res.Branches();
 
         if (res.Kind == tk_BCOND)
@@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr)
 
         if ((data & A_LoadMem) && res.SrcRegs == (1 << 15))
             res.SpecialKind = special_LoadLiteral;
+
+        if (res.Kind == ak_LDM)
+        {
+            u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15));
+            res.DstRegs |= set;
+            res.NotStrictlyNeeded |= set;
+        }
+        if (res.Kind == ak_STM)
+        {
+            u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15));
+            res.SrcRegs |= set;
+            res.NotStrictlyNeeded |= set;
+        }
 
         if ((instr >> 28) < 0xE)
         {
diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h
index c032a4f..2732181 100644
--- a/src/ARM_InstrInfo.h
+++ b/src/ARM_InstrInfo.h
@@ -236,7 +236,7 @@ enum
 
 struct Info
 {
-    u16 DstRegs, SrcRegs;
+    u16 DstRegs, SrcRegs, NotStrictlyNeeded;
     u16 Kind;
 
     u8 SpecialKind;
diff --git a/src/Config.cpp b/src/Config.cpp
index 63d61a3..eb5bfcc 100644
--- a/src/Config.cpp
+++ b/src/Config.cpp
@@ -38,6 +38,7 @@ int GL_Antialias;
 bool JIT_Enable = false;
 int JIT_MaxBlockSize = 12;
 bool JIT_BrancheOptimisations = true;
+bool JIT_LiteralOptimisations = true;
 #endif
 
 ConfigEntry ConfigFile[] =
@@ -52,6 +53,7 @@ ConfigEntry ConfigFile[] =
     {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0},
     {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0},
     {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0},
+    {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0},
 #endif
 
     {"", -1, NULL, 0, NULL, 0}
diff --git a/src/Config.h b/src/Config.h
index 0fcefc3..723ab13 100644
--- a/src/Config.h
+++ b/src/Config.h
@@ -50,6 +50,7 @@ extern int GL_Antialias;
 extern bool JIT_Enable;
 extern int JIT_MaxBlockSize;
 extern bool JIT_BrancheOptimisations;
+extern bool JIT_LiteralOptimisations;
 #endif
 
 }
diff --git a/src/NDS.cpp b/src/NDS.cpp
index e9e6795..141c565 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -1142,7 +1142,7 @@ void UpdateIRQ(u32 cpu)
 
     if (IME[cpu] & 0x1)
     {
-        arm->IRQ = IE[cpu] & IF[cpu];
+        arm->IRQ = !!(IE[cpu] & IF[cpu]);
     }
     else
     {
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
index 45e8e0c..0df9c6c 100644
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ b/src/libui_sdl/DlgEmuSettings.cpp
@@ -43,6 +43,7 @@ uiCheckbox* cbDirectBoot;
 uiCheckbox* cbJITEnabled;
 uiEntry* enJITMaxBlockSize;
 uiCheckbox* cbJITBranchOptimisations;
+uiCheckbox*
cbJITLiteralOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -66,14 +67,16 @@ void OnOk(uiButton* btn, void* blarg) char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || - branchOptimisations != Config::JIT_BrancheOptimisations) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize + || branchOptimisations != Config::JIT_BrancheOptimisations + || literalOptimisations != Config::JIT_LiteralOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -82,7 +85,8 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + Config::JIT_BrancheOptimisations = branchOptimisations; + Config::JIT_LiteralOptimisations = literalOptimisations; restart = true; } @@ -108,11 +112,13 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) { uiControlEnable(uiControl(enJITMaxBlockSize)); uiControlEnable(uiControl(cbJITBranchOptimisations)); + uiControlEnable(uiControl(cbJITLiteralOptimisations)); } else { uiControlDisable(uiControl(enJITMaxBlockSize)); uiControlDisable(uiControl(cbJITBranchOptimisations)); + uiControlDisable(uiControl(cbJITLiteralOptimisations)); } } #endif @@ -174,9 +180,25 @@ void Open() uiBox* row = uiNewHorizontalBox(); uiBoxAppend(in_ctrl, uiControl(row), 0); - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); + uiBoxAppend(row, uiControl(lbl), 0); + } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); + uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); + } } #endif @@ -214,6 +236,7 @@ void Open() OnJITStateChanged(cbJITEnabled, NULL); uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); + uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); #endif uiControlShow(uiControl(win)); diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 3787bab1f69ae22d3e8106d70598ce923e5efe70 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- 
src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 7 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4871 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src/Config.h') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..32cb91c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -206,15 +206,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -227,9 +227,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -272,7 +272,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -285,7 +285,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -544,7 +544,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -584,14 +584,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -685,7 +687,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -725,14 +727,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else 
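+        // no compiled block at this address yet: compile one now; the next
+        // iteration of the loop finds it through LookUpBlockEntry and dispatches it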
        ARMJIT::CompileBlock(this);
 
-    NDS::ARM7Timestamp += Cycles;
-    Cycles = 0;
+    NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1);
 
     // TODO optimize this shit!!!
     if (StopExecution)
diff --git a/src/ARM.h b/src/ARM.h
index 7767095..4877956 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -185,14 +185,14 @@ public:
     {
         // code only. always nonseq 32-bit for ARM9.
         s32 numC = (R[15] & 0x2) ? 0 : CodeCycles;
-        Cycles += numC;
+        Cycles -= numC;
     }
 
     void AddCycles_CI(s32 numI)
     {
         // code+internal
         s32 numC = (R[15] & 0x2) ? 0 : CodeCycles;
-        Cycles += numC + numI;
+        Cycles -= numC + numI;
     }
 
     void AddCycles_CDI()
@@ -203,9 +203,9 @@ public:
         s32 numD = DataCycles;
 
         //if (DataRegion != CodeRegion)
-            Cycles += std::max(numC + numD - 6, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 6, std::max(numC, numD));
         //else
-        //    Cycles += numC + numD;
+        //    Cycles -= numC + numD;
     }
 
     void AddCycles_CD()
@@ -215,9 +215,9 @@ public:
         s32 numD = DataCycles;
 
         //if (DataRegion != CodeRegion)
-            Cycles += std::max(numC + numD - 6, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 6, std::max(numC, numD));
         //else
-        //    Cycles += numC + numD;
+        //    Cycles -= numC + numD;
     }
 
     void GetCodeMemRegion(u32 addr, NDS::MemRegion* region);
@@ -375,13 +375,13 @@ public:
     void AddCycles_C()
     {
         // code only. this code fetch is sequential.
-        Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3];
+        Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3];
     }
 
     void AddCycles_CI(s32 num)
     {
         // code+internal. results in a nonseq code fetch.
-        Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
+        Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num;
     }
 
     void AddCycles_CDI()
@@ -393,21 +393,21 @@ public:
         if ((DataRegion >> 4) == 0x02) // mainRAM
         {
             if (CodeRegion == 0x02)
-                Cycles += numC + numD;
+                Cycles -= numC + numD;
             else
             {
                 numC++;
-                Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+                Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
             }
         }
         else if (CodeRegion == 0x02)
         {
             numD++;
-            Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else
         {
-            Cycles += numC + numD + 1;
+            Cycles -= numC + numD + 1;
         }
     }
 
@@ -420,17 +420,17 @@ public:
         if ((DataRegion >> 4) == 0x02)
         {
             if (CodeRegion == 0x02)
-                Cycles += numC + numD;
+                Cycles -= numC + numD;
             else
-                Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+                Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else if (CodeRegion == 0x02)
         {
-            Cycles += std::max(numC + numD - 3, std::max(numC, numD));
+            Cycles -= std::max(numC + numD - 3, std::max(numC, numD));
         }
         else
         {
-            Cycles += numC + numD;
+            Cycles -= numC + numD;
         }
     }
 };
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 208801e..cc8d4ce 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -2,6 +2,10 @@
 
 #include <string.h>
 #include <assert.h>
+#include <unordered_map>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
 
 #include "Config.h"
 
@@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = {
 u32 AddrTranslate9[0x2000];
 u32 AddrTranslate7[0x4000];
 
-JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 AddressRange CodeRanges[ExeMemSpaceSize / 512];
 
-TinyVector<JitBlock*> JitBlocks;
-JitBlock* RestoreCandidates[0x1000] = {NULL};
+std::unordered_map<u32, JitBlock*> JitBlocks;
 
-u32 HashRestoreCandidate(u32 pseudoPhysicalAddr)
+template <typename K, typename V, int Size, V InvalidValue>
+struct UnreliableHashTable
 {
-    return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53);
-}
+    struct Bucket
+    {
+        K KeyA, KeyB;
+        V ValA, ValB;
+    };
+
+    Bucket Table[Size];
+
+    void Reset()
+    {
+        for (int i = 0; i < Size; i++)
+        {
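+            // mark both ways of this bucket as empty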
+ Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable RestoreCandidates; +UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; } + for (int i = 0; i < block->NumLinks(); i++) + compiler->UnlinkBlock(block->Links()[i]); + block->ResetLinks(); - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } - JitBlocks.Clear(); + JitBlocks.clear(); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - - memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); - for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + + FastBlockLookUp.Reset(); + RestoreCandidates.Reset(); + for (int i = 0; i 
< sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++)
     {
-        if (RestoreCandidates[i])
+        if (RestoreCandidates.Table[i].ValA)
         {
-            delete RestoreCandidates[i];
-            RestoreCandidates[i] = NULL;
+            delete RestoreCandidates.Table[i].ValA;
+            RestoreCandidates.Table[i].ValA = NULL;
+        }
+        if (RestoreCandidates.Table[i].ValB)
+        {
+            delete RestoreCandidates.Table[i].ValB;
+            RestoreCandidates.Table[i].ValB = NULL;
         }
     }
-    for (int i = 0; i < JitBlocks.Length; i++)
+    for (auto it : JitBlocks)
     {
-        JitBlock* block = JitBlocks[i];
+        JitBlock* block = it.second;
         for (int j = 0; j < block->NumAddresses; j++)
         {
             u32 addr = block->AddressRanges()[j];
@@ -788,11 +882,43 @@ void ResetBlockCache()
         }
         delete block;
     }
-    JitBlocks.Clear();
+    JitBlocks.clear();
 
     compiler->Reset();
 }
 
+JitBlockEntry LookUpBlockEntry(u32 addr)
+{
+    u32 entryOffset = FastBlockLookUp.LookUp(addr);
+    if (entryOffset != UINT32_MAX)
+        return compiler->AddEntryOffset(entryOffset);
+
+    auto block = JitBlocks.find(addr);
+    if (block != JitBlocks.end())
+    {
+        FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint));
+        return block->second->EntryPoint;
+    }
+    return NULL;
+}
+
+template <u32 num>
+void LinkBlock(ARM* cpu, u32 codeOffset)
+{
+    u32 targetPseudoPhys = TranslateAddr<num>(cpu->R[15] - ((cpu->CPSR&0x20)?2:4));
+    auto block = JitBlocks.find(targetPseudoPhys);
+    if (block == JitBlocks.end())
+    {
+        CompileBlock(cpu);
+        block = JitBlocks.find(targetPseudoPhys);
+    }
+
+    JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys);
+
+    block->second->AddLink(codeOffset);
+    compiler->LinkBlock(codeOffset, block->second->EntryPoint);
+}
+
 void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 {
     if (cpu->Num == 0)
@@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size)
 
     return NULL;
 }
-}
\ No newline at end of file
+}
+
+template void ARMJIT::LinkBlock<0>(ARM*, u32);
+template void ARMJIT::LinkBlock<1>(ARM*, u32);
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 09cc463..cab385f 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -32,7 +32,6 @@
 extern u32 AddrTranslate9[0x2000];
 extern u32 AddrTranslate7[0x4000];
 
 const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
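+// Compiled blocks are now found in two stages: FastBlockLookUp, a small 2-way
+// associative cache keyed on the pseudo-physical address which may evict
+// entries at any time (hence "unreliable"), backed by the authoritative
+// JitBlocks map on a miss. This replaces the flat FastBlockAccess table below,
+// which kept one entry per halfword of executable memory
+// (0x518000 / 2 pointers, roughly 20 MB on 64-bit hosts).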
-extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) @@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr) return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } -template -inline JitBlockEntry LookUpBlock(u32 addr) -{ - return FastBlockAccess[TranslateAddr(addr) / 2]; -} +JitBlockEntry LookUpBlockEntry(u32 addr); + void Init(); void DeInit(); @@ -73,4 +69,6 @@ void ResetBlockCache(); } +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + #endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 0d6add9..66d1808 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -15,7 +15,8 @@ enum { branch_IdleBranch = 1 << 0, branch_FollowCondTaken = 1 << 1, - branch_FollowCondNotTaken = 1 << 2 + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, }; struct FetchedInstr @@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector assert(capacity > Capacity); T* newMem = new T[capacity]; if (Data != NULL) - memcpy(newMem, Data, sizeof(Data) * Length); + memcpy(newMem, Data, sizeof(T) * Length); T* oldData = Data; Data = newMem; @@ -163,7 +164,6 @@ public: u32 NumInstrs; u32 NumAddresses; - u32 NumLinks; JitBlockEntry EntryPoint; @@ -171,6 +171,21 @@ public: { return &Data[0]; } u32* AddressRanges() { return &Data[NumInstrs]; } + u32* Links() + { return &Data[NumInstrs + NumAddresses]; } + + u32 NumLinks() + { return Data.Length - NumInstrs - NumAddresses; } + + void AddLink(u32 link) + { + Data.Add(link); + } + + void ResetLinks() + { + Data.SetLength(NumInstrs + NumAddresses); + } private: /* @@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000]; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); +template +void LinkBlock(ARM* cpu, u32 codeOffset); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index e02865d..cac590a 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) @@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) IrregularCycles = true; BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - bool previouslyDirty = CPSRDirty; + bool cpsrDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) @@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) LoadReg(reg, RegCache.Mapping[reg]); } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } void Compiler::A_Comp_BranchImm() @@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + 
Comp_SpecialBranchBehaviour(false); Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
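+        // fewer than 32 KiB of code memory left; the buffer is a plain bump
+        // allocator with no per-block freeing, so flush the whole cache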
ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx 
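+/* 64-bit views of the same four argument registers */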
+#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 912299d..f650f42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,9 +30,13 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp + + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -49,7 +53,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index be6a833..f3f8c6c 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -35,10 +35,10 @@ int GL_ScaleFactor; int GL_Antialias; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,7 +52,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 723ab13..fff476a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -47,10 +47,10 @@ extern int GL_ScaleFactor; extern int GL_Antialias; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include +# undef inline +# endif +#elif defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. 
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include +# else +# include +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. 
*/ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+ * {
+ * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+ * }
+ */
+#if defined(_MSC_VER) && defined(_M_IX86)
+# include <intrin.h>
+# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*
+ * Calculates a 64->128-bit long multiply.
+ *
+ * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+ /*
+ * GCC/Clang __uint128_t method.
+ *
+ * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+ * This is usually the best way as it usually uses a native long 64-bit
+ * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+ *
+ * Usually.
+ *
+ * On 32-bit platforms, Clang (and emscripten) still define this type
+ * despite not having the arithmetic for it. This results in a laggy
+ * compiler builtin call which calculates a full 128-bit multiply.
+ * In that case it is best to use the portable one.
+ * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+ */
+#if defined(__GNUC__) && !defined(__wasm__) \
+ && defined(__SIZEOF_INT128__) \
+ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+ __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+ XXH128_hash_t r128;
+ r128.low64 = (xxh_u64)(product);
+ r128.high64 = (xxh_u64)(product >> 64);
+ return r128;
+
+ /*
+ * MSVC for x64's _umul128 method.
+ *
+ * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+ *
+ * This compiles to single operand MUL on x64.
+ */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+# pragma intrinsic(_umul128)
+#endif
+ xxh_u64 product_high;
+ xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+ XXH128_hash_t r128;
+ r128.low64 = product_low;
+ r128.high64 = product_high;
+ return r128;
+
+#else
+ /*
+ * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+ *
+ * This is a fast and simple grade school multiply, which is shown below
+ * with base 10 arithmetic instead of base 0x100000000.
+ *
+ * 9 3 // D2 lhs = 93
+ * x 7 5 // D2 rhs = 75
+ * ----------
+ * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+ * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+ * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+ * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+ * ---------
+ * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+ * + 6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69
+ * ---------
+ * 6 9 7 5 // D4 res = ((27 % 10) * 10) + (15 % 10) + (69 * 100) = 6975
+ *
+ * The reasons for adding the products like this are:
+ * 1. It avoids manual carry tracking. Just like how
+ * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+ * This avoids a lot of complexity.
+ *
+ * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
+ * instruction available in ARM's Digital Signal Processing extension
+ * in 32-bit ARMv6 and later, which is shown below:
+ *
+ * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+ * {
+ * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+ * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+ * *RdHi = (xxh_u32)(product >> 32);
+ * }
+ *
+ * This instruction was designed for efficient long multiplication, and
+ * allows this to be calculated in only 4 instructions at speeds
+ * comparable to some 64-bit ALUs.
+ *
+ * 3. It isn't terrible on other platforms. Usually this will be a couple
+ * of 32-bit ADD/ADCs.
+ */
+
+ /* First calculate all of the cross products. */
+ xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+ xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
+ xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+ xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
+
+ /* Now add the products together. These will never overflow. */
+ xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+ xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
+ xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+ XXH128_hash_t r128;
+ r128.low64 = lower;
+ r128.high64 = upper;
+ return r128;
+#endif
+}
+
+/*
+ * Does a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+ XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+ return product.low64 ^ product.high64;
+}
+
+/* Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+ XXH_ASSERT(0 <= shift && shift < 64);
+ return v64 ^ (v64 >> shift);
+}
+
+/*
+ * We don't need to (or want to) mix as much as XXH64.
+ *
+ * Short hashes are more evenly distributed, so it isn't necessary.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+ h64 = XXH_xorshift64(h64, 37);
+ h64 *= 0x165667919E3779F9ULL;
+ h64 = XXH_xorshift64(h64, 32);
+ return h64;
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. They used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
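+ *
+ * For example, XXH64's tail is branchy; roughly (with hypothetical
+ * mix8/mix4/mix1 helpers standing in for the real rounds):
+ *
+ * while (p + 8 <= end) { h = mix8(h, XXH_readLE64(p)); p += 8; }
+ * if (p + 4 <= end) { h = mix4(h, XXH_readLE32(p)); p += 4; }
+ * while (p < end) { h = mix1(h, *p++); }
+ *
+ * so a 17-byte input pays for one more round than a 16-byte one, and the
+ * branch pattern changes with len % 8.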
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(1 <= len && len <= 3);
+ XXH_ASSERT(secret != NULL);
+ /*
+ * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+ * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+ * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+ */
+ { xxh_u8 const c1 = input[0];
+ xxh_u8 const c2 = input[len >> 1];
+ xxh_u8 const c3 = input[len - 1];
+ xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)
+ | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+ xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+ xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+ xxh_u64 const mixed = keyed * PRIME64_1;
+ return XXH3_avalanche(mixed);
+ }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(secret != NULL);
+ XXH_ASSERT(4 <= len && len <= 8);
+ seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+ { xxh_u32 const input1 = XXH_readLE32(input);
+ xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+ xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+ xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+ xxh_u64 x = input64 ^ bitflip;
+ /* this mix is inspired by Pelle Evensen's rrmxmx */
+ x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24);
+ x *= 0x9FB21C651E98DF25ULL;
+ x ^= (x >> 35) + len;
+ x *= 0x9FB21C651E98DF25ULL;
+ return XXH_xorshift64(x, 28);
+ }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(secret != NULL);
+ XXH_ASSERT(8 <= len && len <= 16);
+ { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+ xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+ xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
+ xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+ xxh_u64 const acc = len
+ + XXH_swap64(input_lo) + input_hi
+ + XXH3_mul128_fold64(input_lo, input_hi);
+ return XXH3_avalanche(acc);
+ }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(len <= 16);
+ { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+ if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+ if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+ return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+ }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+ const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
+ /*
+ * UGLY HACK:
+ * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+ * slower code.
+ *
+ * By forcing seed64 into a register, we disrupt the cost model and
+ * cause it to scalarize. See `XXH32_round()`
+ *
+ * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+ * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+ * GCC 9.2, despite both emitting scalar code.
+ *
+ * GCC generates much better scalar code than Clang for the rest of XXH3,
+ * which is why finding a more optimal codepath is of interest.
+ */
+ __asm__ ("" : "+r" (seed64));
+#endif
+ { xxh_u64 const input_lo = XXH_readLE64(input);
+ xxh_u64 const input_hi = XXH_readLE64(input+8);
+ return XXH3_mul128_fold64(
+ input_lo ^ (XXH_readLE64(secret) + seed64),
+ input_hi ^ (XXH_readLE64(secret+8) - seed64)
+ );
+ }
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
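+/*
+ * Sketch of the "mum" step as used here: each 16-byte chunk goes through
+ * XXH3_mix16B above, which keys two 64-bit lanes with the secret, takes the
+ * full 128-bit product, and XOR-folds it:
+ *
+ *   acc += fold64( (in_lo ^ (sec_lo + seed)) * (in_hi ^ (sec_hi - seed)) )
+ *
+ * where fold64(p) == (xxh_u64)p ^ (xxh_u64)(p >> 64) (XXH3_mul128_fold64).
+ */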
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(16 < len && len <= 128);
+
+ { xxh_u64 acc = len * PRIME64_1;
+ if (len > 32) {
+ if (len > 64) {
+ if (len > 96) {
+ acc += XXH3_mix16B(input+48, secret+96, seed);
+ acc += XXH3_mix16B(input+len-64, secret+112, seed);
+ }
+ acc += XXH3_mix16B(input+32, secret+64, seed);
+ acc += XXH3_mix16B(input+len-48, secret+80, seed);
+ }
+ acc += XXH3_mix16B(input+16, secret+32, seed);
+ acc += XXH3_mix16B(input+len-32, secret+48, seed);
+ }
+ acc += XXH3_mix16B(input+0, secret+0, seed);
+ acc += XXH3_mix16B(input+len-16, secret+16, seed);
+
+ return XXH3_avalanche(acc);
+ }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+ #define XXH3_MIDSIZE_STARTOFFSET 3
+ #define XXH3_MIDSIZE_LASTOFFSET 17
+
+ { xxh_u64 acc = len * PRIME64_1;
+ int const nbRounds = (int)len / 16;
+ int i;
+ for (i=0; i<8; i++) {
+ acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+ }
+ acc = XXH3_avalanche(acc);
+ XXH_ASSERT(nbRounds >= 8);
+#if defined(__clang__) /* Clang */ \
+ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
+ /*
+ * UGLY HACK:
+ * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+ * Everywhere else, it uses scalar code.
+ *
+ * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+ * would still be slower than UMAAL (see XXH_mult64to128).
+ *
+ * Unfortunately, Clang doesn't handle the long multiplies properly and
+ * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+ * scalarized into an ugly mess of VMOV.32 instructions.
+ *
+ * This mess is difficult to avoid without turning autovectorization
+ * off completely, but the issues are usually relatively minor and/or not
+ * worth fixing.
+ *
+ * This loop is the easiest to fix, as unlike XXH32, this pragma
+ * _actually works_ because it is a loop vectorization instead of an
+ * SLP vectorization.
+ */
+ #pragma clang loop vectorize(disable)
+#endif
+ for (i=8 ; i < nbRounds; i++) {
+ acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+ }
+ /* last bytes */
+ acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+ return XXH3_avalanche(acc);
+ }
+}
+
+
+/* === Long Keys === */
+
+#define STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
+
+typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
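+ *
+ * Per 64-bit accumulator lane, one stripe is, in scalar terms (mirroring the
+ * scalar variant at the bottom of this function):
+ *
+ *   data_key = data ^ secret;
+ *   acc += (data_key & 0xFFFFFFFF) * (data_key >> 32); /* UMAC-style mum */
+ *   acc += data; /* or swap(data) in 128-bit mode */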
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
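+ *
+ * In other words, the code below deliberately uses the widening form:
+ *
+ *   uint64x2_t prod_hi = vmull_u32(data_key_hi, prime); /* 32x32 -> 64 */
+ *   xacc[i] = vshlq_n_u64(prod_hi, 32); /* then shift */
+ *
+ * rather than vmul_u32 followed by vshll_n_u32.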
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
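+ *
+ * Note that each stripe advances the input by STRIPE_LEN (64) bytes but the
+ * secret by only XXH_SECRET_CONSUME_RATE (8) bytes, so stripe n is keyed
+ * with secret[n*8 .. n*8 + 63]; the overlapping secret windows are
+ * intentional.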
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
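+ *
+ * Layout sketch (offsets vary per call):
+ *
+ *   base (from XXH_malloc)       ptr (returned, aligned)
+ *   v                            v
+ *   [ ... padding ... | offset ][ s usable bytes ... ]
+ *
+ * ptr[-1] stores the distance back to base so that XXH_alignedFree() can
+ * recover the original pointer.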
+ * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); +} + +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_64bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->acc[0] = PRIME32_3; + statePtr->acc[1] = PRIME64_1; + statePtr->acc[2] = PRIME64_2; + statePtr->acc[3] = PRIME64_3; + statePtr->acc[4] = PRIME64_4; + statePtr->acc[5] = PRIME32_2; + statePtr->acc[6] = PRIME64_5; + statePtr->acc[7] = PRIME32_1; + statePtr->seed = seed; + XXH_ASSERT(secret != NULL); + statePtr->secret = secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if 
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
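+ *
+ * For example, assuming the default XXH3_INTERNALBUFFER_SIZE of 256 bytes:
+ * with bufferedSize == 40 and len == 400, the code below copies 216 bytes
+ * to top the buffer up, consumes the full buffer as
+ * XXH3_INTERNALBUFFER_STRIPES stripes, and leaves the remaining 184 bytes
+ * to the paths that follow.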
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
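+ *
+ * Concretely, since len <= 16, (len - 1) << 54 occupies at most bits 54-57:
+ * near the top of m128.low64, i.e. the middle of the 128-bit value, where
+ * the 128x64 multiply by PRIME64_2 spreads it into both h128.low64 and
+ * h128.high64.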
+ */
+ m128.low64 += (xxh_u64)(len - 1) << 54;
+ input_hi ^= bitfliph;
+ /*
+ * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+ * add the long product of the low 32 bits of input_hi and PRIME32_2 to
+ * the high 64 bits of m128.
+ *
+ * The best approach to this operation is different on 32-bit and 64-bit.
+ */
+ if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+ /*
+ * 32-bit optimized version, which is more readable.
+ *
+ * On 32-bit, it removes an ADC and delays a dependency between the two
+ * halves of m128.high64, but it generates an extra mask on 64-bit.
+ */
+ m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2);
+ } else {
+ /*
+ * 64-bit optimized (albeit more confusing) version.
+ *
+ * Uses some properties of addition and multiplication to remove the mask:
+ *
+ * Let:
+ * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+ * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+ * c = PRIME32_2
+ *
+ * b + (a * c)
+ * Inverse Property: x + y - x == y
+ * b + (a * (1 + c - 1))
+ * Distributive Property: x * (y + z) == (x * y) + (x * z)
+ * b + (a * 1) + (a * (c - 1))
+ * Identity Property: x * 1 == x
+ * b + a + (a * (c - 1))
+ *
+ * Substitute a, b, and c:
+ * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
+ *
+ * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+ * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
+ */
+ m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1);
+ }
+ /* m128 ^= XXH_swap64(m128 >> 64); */
+ m128.low64 ^= XXH_swap64(m128.high64);
+
+ { /* 128x64 multiply: h128 = m128 * PRIME64_2; */
+ XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
+ h128.high64 += m128.high64 * PRIME64_2;
+
+ h128.low64 = XXH3_avalanche(h128.low64);
+ h128.high64 = XXH3_avalanche(h128.high64);
+ return h128;
+ } }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(len <= 16);
+ { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+ if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+ if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+ { XXH128_hash_t h128;
+ xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+ xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+ h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl);
+ h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph);
+ return h128;
+ } }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
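+ *
+ * Even when one of the XXH3_mix16B products cancels to zero, the cross XORs
+ * below (acc.low64 ^= words of input_2, acc.high64 ^= words of input_1)
+ * keep the raw input words in the accumulator.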
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+
+/*
+ * All the functions are actually the same as for the 64-bit streaming variant.
+ * The only difference is the finalization routine.
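+ *
+ * A minimal usage sketch, for illustration only (`read_chunk` stands in for
+ * any application-defined input source, and printing assumes <stdio.h>):
+ *
+ *     XXH3_state_t* const state = XXH3_createState();
+ *     if (state != NULL
+ *         && XXH3_128bits_reset_withSeed(state, 0x1234) == XXH_OK) {
+ *         char buf[4096];
+ *         size_t n;
+ *         while ((n = read_chunk(buf, sizeof(buf))) != 0)
+ *             XXH3_128bits_update(state, buf, n);
+ *         {   XXH128_hash_t const h = XXH3_128bits_digest(state);
+ *             printf("%016llx%016llx\n",
+ *                    (unsigned long long)h.high64,
+ *                    (unsigned long long)h.low64);
+ *     }   }
+ *     XXH3_freeState(state);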
+ */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         state->secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         state->secret + state->secretLimit + STRIPE_LEN
+                                                       - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#define XXH_IMPLEMENTATION   /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h
new file mode 100644
index 0000000..67a5887
--- /dev/null
+++ b/src/xxhash/xxhash.h
@@ -0,0 +1,1965 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. 
+ * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
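+ *
+ * For example (a sketch; `MYLIB_` is an arbitrary prefix):
+ *     #define XXH_NAMESPACE MYLIB_
+ *     #include "xxhash.h"
+ *     // Client code keeps calling XXH32(buf, len, 0); the linker-visible
+ *     // symbol becomes MYLIB_XXH32, avoiding collisions.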
+ */
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    7
+#define XXH_VERSION_RELEASE  4
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXH32_hash_t;
+#     else
+#       error "unsupported platform: need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+/*!
+ * XXH32():
+ *  Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+ *  The memory between input & input+length must be valid (allocated and read-accessible).
+ *  "seed" can be used to alter the result predictably.
+ *  Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*******   Streaming   *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ */
+
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what the order is on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+   /* the following type must have a width of 64-bit */
+   typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*!
+ * XXH64():
+ * Returns the 64-bit hash of sequence of length @length stored at memory
+ * address @input.
+ * @seed can be used to alter the result predictably.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
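+ *
+ * Example (sketch):
+ *     const char msg[] = "xxhash";
+ *     XXH64_hash_t const h1 = XXH64(msg, sizeof(msg)-1, 0);    // default seed
+ *     XXH64_hash_t const h2 = XXH64(msg, sizeof(msg)-1, 42);   // same input,
+ *                                        // different seed, unrelated hash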
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ * + * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run + * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are + * explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The 128-bit version adds additional strength, but it is slightly slower. + * + * The XXH3 algorithm is still in development. + * The results it produces may still change in future versions. + * + * Results produced by v0.7.x are not comparable with results from v0.7.y. + * However, the API is completely stable, and it can safely be used for + * ephemeral data (local sessions). + * + * Avoid storing values in long-term storage until the algorithm is finalized. + * + * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if + * everything remains fine, its current format will be "frozen" and become the + * final one. + * + * After which, return values of XXH3 and XXH128 will no longer change in + * future versions. + * + * XXH3's return values will be officially finalized upon reaching v0.8.0. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) + +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) + +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +#endif + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional + * collision. + * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid trivial sequences, such as repeating sequences and especially '\0', + * as this can cancel out itself. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXH3_SECRET_SIZE_MIN 136 +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly based on the default + * secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. 
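+ * For example (a sketch; `data` and `len` are whatever buffer the caller
+ * wants hashed):
+ *     XXH64_hash_t const h = XXH3_64bits_withSeed(data, len, 0xB0B5C0DEULL);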
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+   /* used to store a custom secret generated from the seed. Makes state larger.
+    * Design might change */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+   XXH32_hash_t bufferedSize;
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t secretLimit;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+   /* note: there is some padding after due to alignment on 64 bytes */
+   const unsigned char* secret;
+};   /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever possible.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with the default parameters.
+ * The result will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, and must outlive the hash streaming session, so
+ * be careful when using stack arrays.
+ * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
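+ *
+ * A sketch of the lifetime pitfall this implies (`fill_with_random_bytes` is
+ * a hypothetical helper):
+ *     XXH_errorcode bad(XXH3_state_t* s)
+ *     {
+ *         unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ *         fill_with_random_bytes(secret, sizeof(secret));
+ *         return XXH3_64bits_reset_withSecret(s, secret, sizeof(secret));
+ *     }   // BUG: `secret` dies here while `s` still references it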
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
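+ *
+ * For example (a sketch; `hashes` is assumed filled by the caller, and
+ * qsort() requires <stdlib.h>):
+ *     XXH128_hash_t hashes[256];
+ *     qsort(hashes, 256, sizeof(hashes[0]), XXH128_cmp);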
+ *
+ * return: >0 if *h128_1  > *h128_2
+ *         <0 if *h128_1  < *h128_2
+ *         =0 if *h128_1 == *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be found in xxhash.c.
+ *
+ * However, code inlining requires the implementation to be visible to the
+ * compiler, usually within the header.
+ *
+ * As a workaround, xxhash.c used to be included within xxhash.h. This caused
+ * some issues with some build systems, especially ones which treat .c files
+ * as source files.
+ *
+ * Therefore, the implementation is now directly integrated within xxhash.h.
+ * Another small advantage is that xxhash.c is no longer needed in /include.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!
+ * XXH_FORCE_MEMORY_ACCESS:
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selecting a different access method for improved
+ * performance.
+ * Method 0 (default):
+ *     Use `memcpy()`. Safe and portable.
+ * Method 1:
+ *     `__attribute__((packed))` statement. It depends on compiler extensions
+ *     and is therefore not portable.
+ *     This method is safe if your compiler supports it, and *generally* as
+ *     fast or faster than `memcpy`.
+ * Method 2:
+ *     Direct access via cast. This method doesn't depend on the compiler but
+ *     violates the C standard.
+ *     It can generate buggy code on targets which do not support unaligned
+ *     memory accesses.
+ *     But in some circumstances, it's the only known way to get the most
+ *     performance (i.e. GCC + ARMv6)
+ * Method 3:
+ *     Byteshift. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction.
+ *     See https://stackoverflow.com/a/32095106/646947 for details.
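+ * (For instance, building with `-DXXH_FORCE_MEMORY_ACCESS=3` selects the
+ * byteshift method from the command line; see the #ifndef guard below.)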
+ * Prefer these methods in priority order (0 > 1 > 2 > 3)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+ * triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is, the result for null input pointers is the same as a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!
+ * XXH_FORCE_ALIGN_CHECK:
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means: check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned or when alignment
+ * doesn't matter for performance.
+ *
+ * This option does not affect XXH3.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK   /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!
+ * XXH_NO_INLINE_HINTS:
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#ifndef XXH_NO_INLINE_HINTS
+#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+/*!
+ * XXH_REROLL:
+ * Whether to reroll XXH32_finalize, and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang.
+ */
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*!
+ * Modify the local functions below should you wish to use some other memory
+ * routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free(void* p)   { free(p); }
+
+/*!
and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS   /* disable inlining hints */
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#elif defined(_MSC_VER)   /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) \
+    || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
+#    endif
+#  else
+#    define XXH_FORCE_INLINE static
+#    define XXH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*
+ * DEBUGLEVEL is expected to be defined externally, typically via the compiler's
+ * command line options. The value must be a number.
+ */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint8_t xxh_u8;
+#else
+   typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* ***   Memory access   *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPU which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/*!
+ * XXH_CPU_LITTLE_ENDIAN:
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+     * loop (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp, v  // not required with VEX encoding
+     *      pslld tmp, 13  // tmp <<= 13
+     *      psrld v, 19    // x >>= 19
+     *      por v, tmp     // x |= tmp
+     *   compared to one for scalar:
+     *      roll v, 13     // reliably fast across the board
+     *      shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     *    :             // However, as an Input/Output Operand,
+     *    "+r"          // constrain a read/write operand (+) as a general purpose register (r).
+     *    (acc)         // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize.
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
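For reference, a minimal one-shot use of XXH32() as defined above (a sketch; assumes xxhash.h is on the include path):

    #include <stdio.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        const char* msg = "hello world";
        XXH32_hash_t h = XXH32(msg, strlen(msg), 0); /* seed 0 */
        printf("XXH32 = %08x\n", (unsigned)h);
        return 0;
    }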
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
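A typical use of the streaming API above, feeding data in arbitrary chunks (a sketch; part1/part2 are assumed buffers, and error checking of XXH32_createState() is omitted):

    XXH32_state_t* st = XXH32_createState();
    XXH32_reset(st, 0);                  // seed 0
    XXH32_update(st, part1, part1_len);  // any chunking works
    XXH32_update(st, part2, part2_len);
    XXH32_hash_t h = XXH32_digest(st);   // equals one-shot XXH32 over both
    XXH32_freeState(st);

The result is identical to hashing the concatenated buffers in a single XXH32() call.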
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
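As an aside, XXH64_avalanche() above also works as a standalone finisher for mixing a single 64-bit value, a common hash-table trick. A sketch (not an official xxHash API):

    static xxh_u64 mix64(xxh_u64 h)
    {
        h ^= h >> 33;  h *= PRIME64_2;  // fold high bits down, then multiply
        h ^= h >> 29;  h *= PRIME64_3;
        h ^= h >> 32;                   // final fold: every input bit can
        return h;                       // affect every output bit
    }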
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
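A short round-trip through the canonical helpers defined next (a sketch; buf, len and the open FILE* f are assumed, error handling omitted):

    XXH64_hash_t h = XXH64(buf, len, 0);
    XXH64_canonical_t canon;
    XXH64_canonicalFromHash(&canon, h);   /* big-endian byte order */
    fwrite(&canon, sizeof(canon), 1, f);  /* safe to store or transmit */
    /* ... later, after fread() back into canon: */
    XXH64_hash_t back = XXH64_hashFromCanonical(&canon);
    /* back == h on any machine, regardless of native endianness */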
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From d13d625f7363449c3fdc041b0a22005b92c83229 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 04:33:36 +0200 Subject: jit: make everything configurable --- src/ARM.cpp | 127 ++++++++++++++++++++++++++++----- src/ARM.h | 3 + src/ARMJIT.cpp | 21 ++++-- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 14 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/Config.cpp | 6 ++ src/Config.h | 3 + src/NDS.cpp | 26 ++++++- src/frontend/qt_sdl/PlatformConfig.cpp | 1 + 10 files changed, 171 insertions(+), 34 deletions(-) (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index baf8468..1cd4bb2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -532,7 +532,7 @@ void ARMv5::Execute() while (NDS::ARM9Timestamp < NDS::ARM9Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -565,14 +565,8 @@ void ARMv5::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); - + } + // TODO optimize this shit!!! if (Halted) { @@ -597,6 +591,58 @@ void ARMv5::Execute() Halted = 0; } +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(0, instrAddr)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + if (Halted) + { + if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + if (IRQ) TriggerIRQ(); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} + void ARMv4::Execute() { if (Halted) @@ -620,7 +666,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -648,13 +694,7 @@ void ARMv4::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + } // TODO optimize this shit!!! 
if (Halted) @@ -679,3 +719,56 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; } + +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(1, instrAddr)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + // TODO optimize this shit!!! + if (Halted) + { + if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + + if (IRQ) TriggerIRQ(); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index e0832e2..3b01ef3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,6 +52,7 @@ public: } virtual void Execute() = 0; + virtual void ExecuteJIT() = 0; bool CheckCondition(u32 code) { @@ -159,6 +160,7 @@ public: void DataAbort(); void Execute(); + void ExecuteJIT(); // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -281,6 +283,7 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); + void ExecuteJIT(); u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 47b425f..e8e6be0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -125,18 +127,21 @@ CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; - FetchedInstr instrs[12]; + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 r15Initial = cpu->R[15]; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; - //printf("block %x %d\n", r15, thumb); do { r15 += thumb ? 2 : 4; instrs[i].Instr = nextInstr[0]; - //printf("%x %x\n", instrs[i].Instr, r15); instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; if (cpu->Num == 0) @@ -166,16 +171,16 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < 10); + } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, r15Initial - (thumb ? 
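Both ExecuteJIT() loops reduce to the same lookup-or-compile dispatch. A condensed sketch with illustrative names (not melonDS's actual signatures):

    typedef int (*JitEntry)();             // compiled block, returns cycles

    int RunJitBlock(u32 pc)
    {
        JitEntry block = LookUpBlock(pc);  // assumed: NULL on cache miss
        if (!block)
            block = CompileBlock(pc);      // assumed: compiles, caches, returns
        return block();                    // execute and account the cycles
    }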
2 : 4), block); + InsertBlock(cpu->Num, blockAddr, block); return block; } -void ResetBlocks() +void InvalidateBlockCache() { memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); @@ -185,6 +190,8 @@ void ResetBlocks() memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + + compiler->Reset(); } } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..004256c 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -111,7 +111,7 @@ void DeInit(); CompiledBlock CompileBlock(ARM* cpu); -void ResetBlocks(); +void InvalidateBlockCache(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 2b7ccd2..fe23859 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -336,13 +336,15 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +void Compiler::Reset() +{ + SetCodePtr((u8*)ResetStart); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) - { - ResetBlocks(); - SetCodePtr((u8*)ResetStart); - } + InvalidateBlockCache(); CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -355,7 +357,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -469,7 +471,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); return res; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e04f96a..cd58012 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,6 +22,8 @@ class Compiler : public Gen::X64CodeBlock public: Compiler(); + void Reset(); + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); diff --git a/src/Config.cpp b/src/Config.cpp index 5745f34..5c0892a 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,9 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +bool JIT_Enable = false; +int JIT_MaxBlockSize = 12; + ConfigEntry ConfigFile[] = { {"BIOS9Path", 1, BIOS9Path, 0, "", 1023}, @@ -48,6 +51,9 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 3947598..9dda157 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,6 +51,9 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +extern bool JIT_Enable; +extern int JIT_MaxBlockSize; + } #endif // CONFIG_H diff --git a/src/NDS.cpp b/src/NDS.cpp index 4073536..cb85d13 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -566,7 +566,7 @@ void Reset() KeyCnt = 0; RCnt = 0; - ARMJIT::ResetBlocks(); + 
ARMJIT::InvalidateBlockCache(); NDSCart::Reset(); GBACart::Reset(); @@ -794,6 +794,11 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } + if (!file->Saving) + { + ARMJIT::InvalidateBlockCache(); + } + return true; } @@ -884,6 +889,7 @@ void RunSystem(u64 timestamp) } } +template u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -917,7 +923,10 @@ u32 RunFrame() } else { - ARM9->Execute(); + if (EnableJIT) + ARM9->ExecuteJIT(); + else + ARM9->Execute(); } RunTimers(0); @@ -940,7 +949,10 @@ u32 RunFrame() } else { - ARM7->Execute(); + if (EnableJIT) + ARM7->ExecuteJIT(); + else + ARM7->Execute(); } RunTimers(1); @@ -970,6 +982,14 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ + if (Config::JIT_Enable) + return RunFrame(); + else + return RunFrame(); +} + void Reschedule(u64 target) { if (CurCPU == 0) diff --git a/src/frontend/qt_sdl/PlatformConfig.cpp b/src/frontend/qt_sdl/PlatformConfig.cpp index 06128d7..bfb3f97 100644 --- a/src/frontend/qt_sdl/PlatformConfig.cpp +++ b/src/frontend/qt_sdl/PlatformConfig.cpp @@ -72,6 +72,7 @@ char MicWavPath[1024]; char LastROMFolder[1024]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { -- cgit v1.2.3 From 86f2be7260f9a9b51efd7c795c28cdcfda775742 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 19:24:00 +0200 Subject: jit: add compile option --- CMakeLists.txt | 36 ++++++++++++++++++++++ src/ARM.cpp | 13 ++++---- src/ARM.h | 6 ++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 - src/CMakeLists.txt | 25 +++++++++------- src/CP15.cpp | 12 ++++++-- src/Config.cpp | 4 +++ src/Config.h | 2 ++ src/NDS.cpp | 26 ++++++++++++++++ src/dolphin/CodeBlock.h | 3 -- 11 files changed, 136 insertions(+), 53 deletions(-) (limited to 'src/Config.h') diff --git a/CMakeLists.txt b/CMakeLists.txt index 885f0dd..1e53c60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,42 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64) + option(ENABLE_JIT "Enable x64 JIT recompiler" ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() + +if (CMAKE_BUILD_TYPE STREQUAL Release) + option(ENABLE_LTO "Enable link-time optimization" ON) +else() + option(ENABLE_LTO "Enable link-time optimization" OFF) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Og) endif() diff --git a/src/ARM.cpp b/src/ARM.cpp index 1cd4bb2..bfe1890 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -81,15 +81,8 @@ ARMv4::ARMv4() : ARM(1) // } -namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} - void ARM::Reset() { - FILE* blabla = fopen("fhhg", "w"); - for (int i = 0; i < ARMInstrInfo::ak_Count; i++) - fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); - 
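The RunFrame() pair above templates the frame loop on a bool, so the JIT check is resolved at compile time inside the hot loop and only one branch remains at the call site; note that the template argument lists (<bool EnableJIT>, <true>, <false>) are dropped by this listing. Spelled out as a sketch:

    template <bool EnableJIT>
    u32 RunFrame();            // frame loop; "if (EnableJIT)" folds away

    u32 RunFrame()
    {
        if (Config::JIT_Enable)
            return RunFrame<true>();
        else
            return RunFrame<false>();
    }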
fclose(blabla); - Cycles = 0; Halted = 0; @@ -591,6 +584,7 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv5::ExecuteJIT() { if (Halted) @@ -642,6 +636,7 @@ void ARMv5::ExecuteJIT() if (Halted == 2) Halted = 0; } +#endif void ARMv4::Execute() { @@ -720,6 +715,7 @@ void ARMv4::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv4::ExecuteJIT() { if (Halted) @@ -771,4 +767,5 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; -} \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index 3b01ef3..c3e7f44 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,7 +52,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -160,7 +162,9 @@ public: void DataAbort(); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -283,7 +287,9 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fe23859..18cb27e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,7 +4,10 @@ #include +#include "../dolphin/CommonFuncs.h" + #ifdef _WIN32 +#include #else #include #include @@ -32,8 +35,6 @@ const int RegisterCache::NativeRegsAvailable = #endif ; -int instructionPopularityARM[ARMInstrInfo::ak_Count]; - /* We'll repurpose this .bss memory @@ -42,29 +43,33 @@ u8 CodeMemory[1024 * 1024 * 32]; Compiler::Compiler() { -#ifdef _WIN32 -#else - u64 pagesize = sysconf(_SC_PAGE_SIZE); -#endif - - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; - -#ifdef _WIN32 -#else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); -#endif - - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + } ClearCodeSpace(); - SetCodePtr(pageAligned); - - memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); - for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -118,7 +123,7 @@ Compiler::Compiler() SetJumpTarget(und); MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); RET(); - } + } { // RSCRATCH mode // ABI_PARAM2 reg n @@ -163,7 +168,10 @@ Compiler::Compiler() RET(); } - ResetStart = (void*)GetWritableCodePtr(); + // move the region forward to prevent overwriting the generated functions + region_size -= GetWritableCodePtr() - region; + total_region_size = region_size; + region = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void 
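The constructor above turns a static buffer into JIT code memory: align the start and size to page boundaries, then mark the pages read/write/execute. The platform split reduces to a small helper, sketched here under the same assumption of page-aligned arguments:

    #ifdef _WIN32
    #include <windows.h>
    #else
    #include <sys/mman.h>
    #endif

    bool MakeExecutable(void* start, size_t size)
    {
    #ifdef _WIN32
        DWORD old;
        return VirtualProtect(start, size, PAGE_EXECUTE_READWRITE, &old) != 0;
    #else
        return mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == 0;
    #endif
    }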
Compiler::Reset() { - SetCodePtr((u8*)ResetStart); + ClearCodeSpace(); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) @@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 75fa42c..bfc0ad9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,19 +49,22 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index 3e1c08b..5b5f935 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -813,7 +813,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -835,7 +837,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -857,8 +861,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -880,8 +886,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 5c0892a..33bab75 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,8 +37,10 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -51,8 +53,10 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, +#ifdef 
JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 9dda157..9296335 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,8 +51,10 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +#ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +#endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index cb85d13..7636a07 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -169,7 +169,9 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED ARMJIT::Init(); +#endif DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); @@ -203,7 +205,9 @@ void DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED ARMJIT::DeInit(); +#endif for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -566,7 +570,9 @@ void Reset() KeyCnt = 0; RCnt = 0; +#ifdef JIT_ENABLED ARMJIT::InvalidateBlockCache(); +#endif NDSCart::Reset(); GBACart::Reset(); @@ -794,10 +800,12 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED if (!file->Saving) { ARMJIT::InvalidateBlockCache(); } +#endif return true; } @@ -923,9 +931,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM9->ExecuteJIT(); else +#endif ARM9->Execute(); } @@ -949,9 +959,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM7->ExecuteJIT(); else +#endif ARM7->Execute(); } @@ -984,9 +996,11 @@ u32 RunFrame() u32 RunFrame() { +#ifdef JIT_ENABLED if (Config::JIT_Enable) return RunFrame(); else +#endif return RunFrame(); } @@ -1998,7 +2012,9 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2050,7 +2066,9 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2118,7 +2136,9 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2414,7 +2434,9 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2475,7 +2497,9 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2546,7 +2570,9 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(1, addr); +#endif switch (addr & 0xFF800000) { diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h index 31a8d93..e71cf6d 100644 --- a/src/dolphin/CodeBlock.h +++ b/src/dolphin/CodeBlock.h @@ -9,7 +9,6 @@ #include "Assert.h" #include "../types.h" -#include "MemoryUtil.h" namespace Common { @@ -41,8 +40,6 @@ public: CodeBlock() = default; virtual ~CodeBlock() { - if (region) - FreeCodeSpace(); } CodeBlock(const CodeBlock&) = delete; CodeBlock& operator=(const CodeBlock&) = delete; -- cgit v1.2.3 From 40b88ab05aeb7e5c5216f29f4004fb5797db04b5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... 
- more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- 18 files changed, 1529 insertions(+), 688 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 7caef75..1e75301 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -623,21 +623,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -753,23 +758,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! 
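The Halted handling above encodes this commit's idle-loop recognition (mirrored for the ARM7 just below): a compiled block flags an idle loop by setting bit 0x20 in Halted, and since such a loop writes no memory and carries no state between iterations, the emulator may fast-forward to the next scheduled event. The core of it, as a sketch:

    bool idleLoop = Halted & 0x20;  // set by the compiled block
    Halted &= ~0x20;
    if ((Halted == 1 || idleLoop) && Timestamp < Target)
        Timestamp = Target;         // nothing observable can happen before then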
+ if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -779,6 +789,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -801,6 +813,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 811b2e0..b36120a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -311,7 +311,7 @@ public: { *val = BusRead8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -320,7 +320,7 @@ public: *val = BusRead16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -329,7 +329,7 @@ public: *val = BusRead32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -337,14 +337,14 @@ public: addr &= ~3; *val = BusRead32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -353,7 +353,7 @@ public: BusWrite16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -362,7 +362,7 @@ public: BusWrite32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -370,7 +370,7 @@ public: addr &= ~3; BusWrite32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include +#include #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) 
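JIT_DEBUGPRINT is defined to nothing here, so the trace calls sprinkled through this file compile away entirely. The usual shape of such a switchable macro, as a sketch (presumably the define is swapped when debugging; the ## is a GNU-style extension for empty argument lists):

    #include <cstdio>

    #ifdef JIT_DEBUG
    #define JIT_DEBUGPRINT(msg, ...) printf(msg, ##__VA_ARGS__)
    #else
    #define JIT_DEBUGPRINT(msg, ...) do { } while (0)
    #endif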
+ Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, 
+ DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, 
+ +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG pseudo opcode +}; +#undef F
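(A worked illustration of IsIdleLoop's two bitmasks, mirroring the Dolphin check cited above — illustrative only, not from the patch:

    // loop: ldr r0, [r1]    ; r1 is read before ever being written
    //       cmp r0, #0      ;   -> r1 becomes "disallowed to write"
    //       beq loop        ; a branch is allowed only as the last instruction
    // no register feeding an iteration is modified by it -> idle loop;
    // the branch gets branch_IdleBranch and the core can skip ahead.
    //
    // loop: subs r0, r0, #1 ; r0 is read (now disallowed) and then written
    //       bne  loop       ;   -> dstRegs & regsDisallowedToWrite != 0
    // the counter carries state between iterations -> not an idle loop.

Stores are rejected outright via special_WriteMem, since a loop that writes memory always has an observable effect.)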
+ +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[j].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? 
"thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file
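(GetFuncForAddr keys its dispatch on size | store: the access sizes 8/16/32 are even, so ORing in the store flag yields six distinct cases, 8/9, 16/17, 32/33. The same pattern in isolation — PickHandler and the Read/Write names are invented for illustration:

    // sketch of the (size | store) dispatch used by GetFuncForAddr above
    void* PickHandler(bool store, int size) // size is 8, 16 or 32
    {
        switch (size | (store ? 1 : 0))
        {
        case 8:  return (void*)Read8;   case 9:  return (void*)Write8;
        case 16: return (void*)Read16;  case 17: return (void*)Write16;
        case 32: return (void*)Read32;  case 33: return (void*)Write32;
        }
        return NULL;
    }

This is also why the Wifi case checks size == 16 explicitly: Wifi::Read/Write only exist as halfword handlers.)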
diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. - Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, - we only need every second half word to be adressable. +extern u32 AddrTranslate9[0x2000]; +extern u32 AddrTranslate7[0x4000]; - In case a memory write hits mapped memory, the function block at this - address is set to null, so it's recompiled the next time it's executed. - - This method has disadvantages, namely that only writing to the - first instruction of a block marks it as invalid and that memory remapping - (SWRAM and VRAM) isn't taken into account. -*/ - -struct BlockCache -{ - CompiledBlock* AddrMapping9[0x2000] = {0}; - CompiledBlock* AddrMapping7[0x4000] = {0}; - - CompiledBlock MainRAM[4*1024*1024/2]; - CompiledBlock SWRAM[0x8000/2]; // Shared working RAM - CompiledBlock ARM9_ITCM[0x8000/2]; - CompiledBlock ARM9_LCDC[0xA4000/2]; - CompiledBlock ARM9_BIOS[0x8000/2]; - CompiledBlock ARM7_BIOS[0x4000/2]; - CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM -}; - -extern BlockCache cache; +const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... +extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template <u32 num> inline bool IsMapped(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; } template <u32 num> -inline CompiledBlock LookUpBlock(u32 addr) +inline u32 TranslateAddr(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } template <u32 num> -inline void Invalidate16(u32 addr) +inline JitBlockEntry LookUpBlock(u32 addr) { - if (IsMapped(addr)) - { - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; - } -} - -template <u32 num> -inline void Invalidate32(u32 addr) -{ - if (IsMapped(addr)) - { - if (num == 0) - { - CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; - page[(addr & 0x7FFF) >> 1] = NULL; - page[((addr + 2) & 0x7FFF) >> 1] = NULL; - } - else - { - CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; - } - } -} - -template <u32 num> -inline void InsertBlock(u32 addr, CompiledBlock func) -{ - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + return FastBlockAccess[TranslateAddr<num>(addr) / 2]; } void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateAll(); + +void InvalidateITCM(u32 addr); +void InvalidateByAddr7(u32 addr); + +void CompileBlock(ARM* cpu); -void InvalidateBlockCache(); +void ResetBlockCache(); }
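(With the new ARMJIT.h, dispatch is a flat table lookup on the pseudo physical PC instead of the old two-level page walk. A hedged sketch of the caller side — the real loop lives in the CPUs' ExecuteJIT, this is only illustrative:

    // run or compile the block at the current ARM9 PC (num = 0)
    u32 instrAddr = cpu->R[15] - ((cpu->CPSR & 0x20) ? 2 : 4);
    ARMJIT::JitBlockEntry entry = ARMJIT::LookUpBlock<0>(instrAddr);
    if (!entry)
    {
        ARMJIT::CompileBlock(cpu);               // fills FastBlockAccess
        entry = ARMJIT::LookUpBlock<0>(instrAddr);
    }
    cpu->Cycles += entry();                      // returns the constant part of the cycle count

Note that CompileBlock no longer returns the entry point; callers re-read it through FastBlockAccess after compiling.)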
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
Blocks; + u16 TimesInvalidated; +}; + +extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +typedef void (*InterpreterFunc)(ARM* cpu); +extern InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index fe2f203..ed6a2b7 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -60,15 +60,46 @@ public: assert("Welp!"); } + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + void Flush() { BitSet16 loadedSet(LoadedRegs); for (int reg : loadedSet) UnloadRegister(reg); + LiteralsLoaded = 0; } void Prepare(bool thumb, int i) { + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + u16 futureNeeded = 0; int ranking[16]; for (int j = 0; j < 16; j++) @@ -86,7 +117,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; + FetchedInstr Instr = Instrs[i]; u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -125,6 +156,9 @@ public: static const int NativeRegsAvailable; Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; u32 NativeRegsUsed = 0; u16 LoadedRegs = 0; u16 DirtyRegs = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f868ddf..14c223b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp() MOV(32, rd, op2); if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); if (S) { @@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); - if (op & 1) + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); @@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU() u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) - Comp_AddCycles_CI(1); + Comp_AddCycles_CI(1); // shift by reg else Comp_AddCycles_C(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cc7a3c4..0dedb3f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -16,9 +16,6 @@ int squeezePointer(T* ptr) void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { // we can simplify constant branches by a lot - // it's not completely safe to assume stuff like, which instructions to preload - // we'll see how it works out - IrregularCycles = true; u32 newPC; @@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, 
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
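All the Thumb loads and stores above now funnel into Comp_MemAccess through ComplexOperand, which packs the two ARM offset forms - a plain immediate, or a register with shift op and amount - into a single argument. Its definition is not part of this hunk; a stand-in with the same shape (field names follow the usage seen above, the rest is assumption):

#include <cstdint>

struct ComplexOperand
{
    bool IsImm;
    union
    {
        uint32_t Imm;
        struct { int Reg, Op, Amount; } Reg;
    };

    ComplexOperand() : IsImm(false), Imm(0) {}
    ComplexOperand(uint32_t imm) : IsImm(true), Imm(imm) {}
    ComplexOperand(int reg, int op, int amount) : IsImm(false)
    {
        Reg.Reg = reg; Reg.Op = op; Reg.Amount = amount;
    }
};

int main()
{
    ComplexOperand imm(0xFFFu), reg(6, 0, 0);
    return (imm.IsImm && !reg.IsImm) ? 0 : 1;
}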
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index 5b5f935..8a9b31d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -562,9 +562,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
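A knock-on effect of the new T_WriteMem flag above: the Thumb flag space grew past bit 20, so tk() moves from << 21 to << 22 and Decode() now extracts the kind with >> 22 to match - with the old shift, bit 21 would have leaked into the kind field. The packing, checked at compile time (constants illustrative):

#include <cstdint>

constexpr uint32_t T_WriteMem = 1u << 21;
constexpr uint32_t tk(uint32_t kind) { return kind << 22; }
constexpr uint32_t KindOf(uint32_t data) { return (data >> 22) & 0x3F; }

static_assert(KindOf(tk(0x2A) | T_WriteMem) == 0x2A,
              "flags below bit 22 must not corrupt the kind field");

int main() { return 0; }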
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -814,7 +816,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -838,7 +840,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -862,8 +864,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -887,8 +888,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 33bab75..c117a41 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,6 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -56,6 +57,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 9296335..c9013aa 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,6 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0bde139..0cfbd1a 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -575,7 +575,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -807,7 +807,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -2016,10 +2016,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2070,10 +2066,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2140,10 +2132,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2439,7 +2427,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2502,7 +2490,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2575,7 +2563,7 @@ void 
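The CP15 and NDS write paths above are where invalidation gets its precision: a store into memory that may hold translated code looks up the 512-byte AddressRange covering the written address and throws out (or retires) the blocks recorded there. In outline, reusing names this patch series defines elsewhere - the wrapper itself is illustrative, not code from the diff:

void OnCodeMemoryWrite(u32 pseudoPhys)
{
    AddressRange* range = &CodeRanges[pseudoPhys / 512];
    if (range->Blocks.Length > 0)           // pay the cost only where code actually lives
        InvalidateByAddr(pseudoPhys, true); // true: keep blocks as restore candidates
}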
ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) -- cgit v1.2.3 From 441869a10567c2da3de210052cbe93d783a9ce83 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 13:29:17 +0200 Subject: integrate changes from ARM64 backend and more - better handle LDM/STM in reg alloc - unify Halted and IRQ in anticipation for branch inlining - literal optimisations can be disabled in gui - jit blocks follow simple returns - fix idle loop detection - break jit blocks on IRQ (fixes saving in Pokemon White) --- src/ARM.cpp | 40 ++++++++++++++++++----------- src/ARM.h | 13 +++++++--- src/ARMJIT.cpp | 50 +++++++++++++++++++++++++++++++------ src/ARMJIT_RegisterCache.h | 33 +++++++++++++++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 7 +++--- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++---- src/ARM_InstrInfo.cpp | 28 +++++++++++++++++++++ src/ARM_InstrInfo.h | 2 +- src/Config.cpp | 2 ++ src/Config.h | 1 + src/NDS.cpp | 4 +-- 11 files changed, 153 insertions(+), 43 deletions(-) (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 1e75301..2f4aa90 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -159,7 +159,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -632,16 +632,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -769,16 +774,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index b36120a..96dd857 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e70..0695b85 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) 
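The union introduced in ARM.h above is the concrete form of "unify Halted and IRQ" from the commit message: Halted, IRQ and IdleLoop alias the low bytes of StopExecution, so the execution loop can test every reason to stop with a single 32-bit compare, and the savestate code keeps its single Var32. A sketch of the aliasing - like the patch itself, it relies on the widely supported anonymous-struct and union-punning extensions:

#include <cstdint>
#include <cassert>

union StopFlags
{
    struct { uint8_t Halted, IRQ, IdleLoop; };
    uint32_t StopExecution;
};

int main()
{
    StopFlags f;
    f.StopExecution = 0;           // clear every exit reason at once
    f.IRQ = 1;                     // set any single one...
    assert(f.StopExecution != 0);  // ...and the combined test trips
}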
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
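DecodeBranch above implements the "jit blocks follow simple returns" item: CompileBlock remembers the value the last BL (or BL_LONG) wrote to LR, forgets it as soon as any instruction writes R14, and can then treat a later BX LR as a branch to a compile-time constant. The bookkeeping reduced to a hypothetical stand-alone helper:

#include <cstdint>

struct LinkTracker
{
    bool HasLink = false;
    uint32_t LR = 0;

    void OnInstr(bool writesR14, bool isBranchWithLink, uint32_t instrAddr)
    {
        if (writesR14) HasLink = false;   // any write to R14 invalidates the tracking
        if (isBranchWithLink)
        {
            HasLink = true;
            LR = instrAddr + 4;           // the return address BL establishes
        }
    }

    bool TryResolveReturn(uint32_t& target) const  // for a BX LR
    {
        if (!HasLink) return false;
        target = LR;
        return true;
    }
};

int main() { return 0; }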
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = 
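The register-allocator change above adds a second tier of registers: everything an LDM/STM touches is flagged NotStrictlyNeeded - worth caching, never worth an eviction - so those registers are mapped only while spare host registers remain. The selection loop as a runnable toy:

#include <cstdint>
#include <vector>

std::vector<int> PickOptionalLoads(uint16_t optionalRegs, int loadedCount, int nativeAvailable)
{
    std::vector<int> picked;
    for (int reg = 0; reg < 16; reg++)
    {
        if (loadedCount + (int)picked.size() >= nativeAvailable)
            break;                                // never evict for an optional register
        if (optionalRegs & (1 << reg))
            picked.push_back(reg);
    }
    return picked;
}

int main()
{
    auto regs = PickOptionalLoads(0x00F0, 4, 6);  // r4-r7 wanted, two host slots free
    return regs.size() == 2 ? 0 : 1;              // only r4 and r5 get mapped
}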
CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
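The rnMapped = Imm32(R15 & ~0x2) special case above encodes a Thumb detail: when PC is the base register of a load, it reads as the instruction address plus 4 forced down to word alignment, so the base becomes a compile-time constant. A quick numeric check:

#include <cstdint>
#include <cassert>

uint32_t ThumbPCBase(uint32_t instrAddr) { return (instrAddr + 4) & ~2u; }

int main()
{
    assert(ThumbPCBase(0x02000000) == 0x02000004);
    assert(ThumbPCBase(0x02000002) == 0x02000004);  // halfword-aligned PC rounds down
}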
-1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe..8f8bd35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4f..2732181 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index c117a41..a7d78cd 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -41,6 +41,7 @@ char DSiNANDPath[1024]; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,6 +59,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c9013aa..1fcd9bb 100644 --- a/src/Config.h +++ b/src/Config.h @@ -55,6 +55,7 @@ extern char DSiNANDPath[1024]; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 0cfbd1a..7b6a450 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1211,9 +1211,9 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); if ((ConsoleType == 1) && cpu) - arm->IRQ |= (IE2 & IF2); + arm->IRQ |= !!(IE2 & IF2); } else { -- cgit v1.2.3 From 1c07932b40e6e072c6ea66c49889860252e45186 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + 
src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 6 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4870 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src/Config.h') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 896bb5c..3eac74d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -252,15 +252,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -273,9 +273,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -315,7 +315,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -328,7 +328,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -587,7 +587,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -627,14 +627,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -728,7 +730,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -768,14 +770,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = 
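Every Cycles += in the hunks above turns into -= because this commit makes the counter run downwards: the dispatcher seeds Cycles with Target - Timestamp - 1, each block subtracts what it consumed, and execution leaves once the value goes negative, so generated code can test the sign flag after a SUB instead of comparing against a target. The scheme end to end:

#include <cstdint>
#include <cstdio>

int main()
{
    int64_t timestamp = 0, target = 10;
    int32_t cycles = (int32_t)(target - timestamp - 1);  // the "- 1" turns "<= 0" into "< 0"
    while (cycles >= 0)                                  // JIT code: a plain sign test
        cycles -= 3;                                     // each block: SUB Cycles, n
    timestamp = target - (cycles + 1);                   // convert back, keeping the overshoot
    printf("stopped at %lld, target was %lld\n", (long long)timestamp, (long long)target);
    return 0;
}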
NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp += Cycles; - Cycles = 0; + NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index ccef265..b71102a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -193,14 +193,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -211,9 +211,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -223,9 +223,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -387,13 +387,13 @@ public: void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; } void AddCycles_CDI() @@ -405,21 +405,21 @@ public: if ((DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else { numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } } else if (CodeRegion == 0x02) { numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD + 1; + Cycles -= numC + numD + 1; } } @@ -432,17 +432,17 @@ public: if ((DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else if (CodeRegion == 0x02) { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD; + Cycles -= numC + numD; } } }; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 208801e..cc8d4ce 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,10 @@ #include #include +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" #include "Config.h" @@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = { u32 AddrTranslate9[0x2000]; u32 AddrTranslate7[0x4000]; -JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; AddressRange CodeRanges[ExeMemSpaceSize / 512]; -TinyVector JitBlocks; -JitBlock* RestoreCandidates[0x1000] = {NULL}; +std::unordered_map JitBlocks; -u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) +template +struct UnreliableHashTable { - return 
(u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); -} + struct Bucket + { + K KeyA, KeyB; + V ValA, ValB; + }; + + Bucket Table[Size]; + + void Reset() + { + for (int i = 0; i < Size; i++) + { + Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable RestoreCandidates; +UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
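The two-way bucket table above is deliberately lossy: Insert may displace an older entry and hands it back for disposal, and LookUp may miss even for a key that was inserted. That is acceptable for both users - RestoreCandidates is only a recycling heuristic, and FastBlockLookUp falls back to the real map on a miss. Usage in the style of the surrounding code (types come from ARMJIT.cpp, so this fragment is not stand-alone):

JitBlock* evicted = RestoreCandidates.Insert(pseudoPhysicalAddr, block);
if (evicted)
    delete evicted;                  // whatever falls out of the cache is ours to free

JitBlock* prev = RestoreCandidates.LookUp(pseudoPhysicalAddr);
if (prev)                            // a miss here is allowed; it only costs a recompile
    RestoreCandidates.Remove(pseudoPhysicalAddr);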
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; } + for (int i = 0; i < block->NumLinks(); i++) + compiler->UnlinkBlock(block->Links()[i]); + block->ResetLinks(); - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } - JitBlocks.Clear(); + JitBlocks.clear(); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - - memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); - for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + + FastBlockLookUp.Reset(); + RestoreCandidates.Reset(); + for (int i = 0; i 
< sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { - if (RestoreCandidates[i]) + if (RestoreCandidates.Table[i].ValA) { - delete RestoreCandidates[i]; - RestoreCandidates[i] = NULL; + delete RestoreCandidates.Table[i].ValA; + RestoreCandidates.Table[i].ValA = NULL; + } + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValB; + RestoreCandidates.Table[i].ValB = NULL; } } - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -788,11 +882,43 @@ void ResetBlockCache() } delete block; } - JitBlocks.Clear(); + JitBlocks.clear(); compiler->Reset(); } +JitBlockEntry LookUpBlockEntry(u32 addr) +{ + u32 entryOffset = FastBlockLookUp.LookUp(addr); + if (entryOffset != UINT32_MAX) + return compiler->AddEntryOffset(entryOffset); + + auto block = JitBlocks.find(addr); + if (block != JitBlocks.end()) + { + FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + return block->second->EntryPoint; + } + return NULL; +} + +template +void LinkBlock(ARM* cpu, u32 codeOffset) +{ + u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); + auto block = JitBlocks.find(targetPseudoPhys); + if (block == JitBlocks.end()) + { + CompileBlock(cpu); + block = JitBlocks.find(targetPseudoPhys); + } + + JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); + + block->second->AddLink(codeOffset); + compiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) @@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) return NULL; } -} \ No newline at end of file +} + +template void ARMJIT::LinkBlock<0>(ARM*, u32); +template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 09cc463..cab385f 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -32,7 +32,6 @@ extern u32 AddrTranslate9[0x2000]; extern u32 AddrTranslate7[0x4000]; const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... 
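LookUpBlockEntry above caches entry offsets rather than raw pointers: an offset from ResetStart always fits the fast table's 32-bit value slot, leaving UINT32_MAX free as the miss sentinel - a guarantee a 64-bit host pointer could not give. The AddEntryOffset/SubEntryOffset round trip in isolation:

#include <cstdint>
#include <cassert>

int main()
{
    uint8_t codeMem[64];
    uint8_t* resetStart = codeMem;
    uint8_t* entry = codeMem + 40;

    uint32_t off = (uint32_t)(entry - resetStart);  // SubEntryOffset
    assert(resetStart + off == entry);              // AddEntryOffset
    assert(off != UINT32_MAX);                      // sentinel stays distinguishable
}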
-extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) @@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr) return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } -template -inline JitBlockEntry LookUpBlock(u32 addr) -{ - return FastBlockAccess[TranslateAddr(addr) / 2]; -} +JitBlockEntry LookUpBlockEntry(u32 addr); + void Init(); void DeInit(); @@ -73,4 +69,6 @@ void ResetBlockCache(); } +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + #endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 0d6add9..66d1808 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -15,7 +15,8 @@ enum { branch_IdleBranch = 1 << 0, branch_FollowCondTaken = 1 << 1, - branch_FollowCondNotTaken = 1 << 2 + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, }; struct FetchedInstr @@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector assert(capacity > Capacity); T* newMem = new T[capacity]; if (Data != NULL) - memcpy(newMem, Data, sizeof(Data) * Length); + memcpy(newMem, Data, sizeof(T) * Length); T* oldData = Data; Data = newMem; @@ -163,7 +164,6 @@ public: u32 NumInstrs; u32 NumAddresses; - u32 NumLinks; JitBlockEntry EntryPoint; @@ -171,6 +171,21 @@ public: { return &Data[0]; } u32* AddressRanges() { return &Data[NumInstrs]; } + u32* Links() + { return &Data[NumInstrs + NumAddresses]; } + + u32 NumLinks() + { return Data.Length - NumInstrs - NumAddresses; } + + void AddLink(u32 link) + { + Data.Add(link); + } + + void ResetLinks() + { + Data.SetLength(NumInstrs + NumAddresses); + } private: /* @@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000]; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); +template +void LinkBlock(ARM* cpu, u32 codeOffset); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index e02865d..cac590a 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) @@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) IrregularCycles = true; BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - bool previouslyDirty = CPSRDirty; + bool cpsrDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) @@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) LoadReg(reg, RegCache.Mapping[reg]); } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } void Compiler::A_Comp_BranchImm() @@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + 
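The one-character TinyVector fix above (sizeof(Data) to sizeof(T)) deserves a note: Data is a T*, so sizeof(Data) is the pointer size, and the old memcpy moved 8 * Length bytes on x64 - reading past the old allocation whenever T is smaller than a pointer. The mismatch in two lines:

#include <cstdio>

int main()
{
    unsigned short* data = nullptr;
    // on x64: sizeof(data) == 8 (the pointer), sizeof(*data) == 2 (the element)
    printf("sizeof(Data) = %zu, sizeof(T) = %zu\n", sizeof(data), sizeof(*data));
    return 0;
}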
Comp_SpecialBranchBehaviour(false); Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... 
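The NOP(5) above is the heart of block linking: five bytes is exactly the size of an x64 JMP rel32, so LinkBlock/UnlinkBlock further down can flip a block exit between "fall back to the dispatcher" and "jump straight into the next block" with a single in-place rewrite. The two encodings, assuming the target lies within +/-2 GiB of the patch site:

#include <cstdint>
#include <cstring>

void PatchJmp(uint8_t* at, const uint8_t* target)
{
    int32_t rel = (int32_t)(target - (at + 5));  // rel32 counts from the next instruction
    at[0] = 0xE9;                                // JMP rel32
    std::memcpy(at + 1, &rel, 4);
}

void UnpatchJmp(uint8_t* at)
{
    static const uint8_t nop5[5] = {0x0F, 0x1F, 0x44, 0x00, 0x00};  // canonical 5-byte NOP
    std::memcpy(at, nop5, 5);
}

int main() { return 0; }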
ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx 
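+// 64-bit views of the same System V argument registers follow; ARM_Dispatch uses them for the ARM* pointer and the block entry point.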
+#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c34ba3b..a0c3a36 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,9 +49,12 @@ add_library(core STATIC WifiAP.cpp tiny-AES-c/aes.c + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -68,7 +71,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index 07b1e3e..e69319b 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -38,10 +38,10 @@ char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -58,7 +58,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 1fcd9bb..d546524 100644 --- a/src/Config.h +++ b/src/Config.h @@ -52,10 +52,10 @@ extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * Note: This file is separated for development purposes.
+ * It will be integrated into `xxhash.h` when development stage is completed.
+ *
+ * Credit: most of the work on vectorial and asm variants comes from @easyaspi314
+ */
+
+#ifndef XXH3_H_1397135465
+#define XXH3_H_1397135465
+
+/* === Dependencies === */
+#ifndef XXHASH_H_5627135585666179
+/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
+# undef XXH_INLINE_ALL /* avoid redefinition */
+# define XXH_INLINE_ALL
+#endif
+#include "xxhash.h"
+
+
+/* === Compiler specifics === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
+# define XXH_RESTRICT restrict
+#else
+/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+# define XXH_RESTRICT /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+# define XXH_likely(x) __builtin_expect(x, 1)
+# define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+# define XXH_likely(x) (x)
+# define XXH_unlikely(x) (x)
+#endif
+
+#if defined(__GNUC__)
+# if defined(__AVX2__)
+# include <immintrin.h>
+# elif defined(__SSE2__)
+# include <emmintrin.h>
+# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+# define inline __inline__ /* clang bug */
+# include <arm_neon.h>
+# undef inline
+# endif
+#elif defined(_MSC_VER)
+# include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *     xxh_u64 x;
+ *     x ^= (x >> 47); // good
+ *     x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *     x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *     // note: funnel shifts are not usually cheap.
+ *     x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *     x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32.
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
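+ * For example, the AVX2 path below uses XXH_ACC_ALIGN == 32, matching sizeof(__m256i).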
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ *
+ * aarch64 cannot access the high bits of a Q-form register, and writes to a
+ * D-form register zero the high bits, similar to how writes to W-form scalar
+ * registers (or DWORD registers on x86_64) work.
+ *
+ * The formerly free vget_high intrinsics now require a vext (with a few
+ * exceptions)
+ *
+ * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
+ * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
+ * operand.
+ *
+ * The equivalent of the VZIP.32 on the lower and upper halves would be this
+ * mess:
+ *
+ *     ext  v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+ *     zip1 v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+ *     zip2 v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+ *
+ * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
+ *
+ *     shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
+ *     xtn  v0.2s, v0.2d      // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+ *
+ * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ */
+
+/*
+ * Function-like macro:
+ * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+ * {
+ *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+ *     outHi = (uint32x2_t)(in >> 32);
+ *     in = UNDEFINED;
+ * }
+ */
+# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+   && defined(__GNUC__) \
+   && !defined(__aarch64__) && !defined(__arm64__)
+# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+    do { \
+      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
+      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+      __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
+      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
+      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
+    } while (0)
+# else
+# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
+    do { \
+      (outLo) = vmovn_u64 (in); \
+      (outHi) = vshrn_n_u64 ((in), 32); \
+    } while (0)
+# endif
+#endif /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+# if defined(__s390x__)
+# include <s390intrin.h>
+# else
+# include <altivec.h>
+# endif
+
+# undef vector /* Undo the pollution */
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+# ifndef XXH_VSX_BE
+# if defined(__BIG_ENDIAN__) \
+   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+# define XXH_VSX_BE 1
+# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+# warning "-maltivec=be is not recommended. Please use native endianness."
+# define XXH_VSX_BE 1
+# else
+# define XXH_VSX_BE 0
+# endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+/* A wrapper for POWER9's vec_revb. */
+# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+# define XXH_vec_revb vec_revb
+# else
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+# endif
+# endif /* XXH_VSX_BE */
+
+/*
+ * Performs an unaligned load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meaning swap depending on version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+# define XXH_vec_mulo vec_mulo
+# define XXH_vec_mule vec_mule
+# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+# define XXH_vec_mulo __builtin_altivec_vmulouw
+# define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+#else
+# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+# else
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+# endif
+#endif /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+# error "default keyset is not large enough"
+#endif
+
+/* Pseudorandom secret taken directly from FARSH */
+XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ *     XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+ *     {
+ *         return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+ *     }
+ */
+#if defined(_MSC_VER) && defined(_M_IX86)
+# include <intrin.h>
+# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*
+ * Calculates a 64->128-bit long multiply.
+ *
+ * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this type
+     * despite not having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64 = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+# pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64 = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+#else
+    /*
+     * Portable scalar method.
Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/* + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * We don't need to (or want to) mix as much as XXH64. + * + * Short hashes are more evenly distributed, so it isn't necessary. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. 
+ * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + 
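+    /* Dispatch on length: 9-16 bytes, 4-8, 1-3, then the empty-input case; each helper mixes both ends of the input with a different region of the secret. */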
XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + __asm__ ("" : "+r" (seed64)); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
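+ * Each 16-byte chunk goes through XXH3_mix16B above: two 64-bit loads XORed with secret words, then a 128-bit multiply folded back to 64 bits.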
*/ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); + + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. 
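+ * (Per 64-bit lane in the scalar path: acc += input; acc += (u32)(input ^ secret) * ((input ^ secret) >> 32).)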
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
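+ * vld1q_u8 on a uint8_t pointer performs the unaligned load safely instead.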
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
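+ * Each stripe covers STRIPE_LEN (64) input bytes but advances the secret by only XXH_SECRET_CONSUME_RATE (8) bytes, so consecutive stripes see overlapping secret windows.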
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8);  /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);    /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));   /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const xxh_u8* secret, size_t secretSize)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->acc[0] = PRIME32_3;
+    statePtr->acc[1] = PRIME64_1;
+    statePtr->acc[2] = PRIME64_2;
+    statePtr->acc[3] = PRIME64_3;
+    statePtr->acc[4] = PRIME64_4;
+    statePtr->acc[5] = PRIME32_2;
+    statePtr->acc[6] = PRIME64_5;
+    statePtr->acc[7] = PRIME32_1;
+    statePtr->seed = seed;
+    XXH_ASSERT(secret != NULL);
+    statePtr->secret = secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
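+ * With the default sizes (XXH3_INTERNALBUFFER_SIZE == 256 and 64-byte
+ * stripes), XXH3_INTERNALBUFFER_STRIPES above evaluates to 4.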
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
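+ * (For instance, len == 16 makes the statement below add (xxh_u64)15 << 54
+ * to m128.low64.)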
+ */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* === XXH3 128-bit streaming === */
+
+/*
+ * All the functions are actually the same as for 64-bit streaming variant.
+ * The only difference is the finalization routine.
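+ *
+ * A minimal usage sketch (error checks omitted; `buf` and `n` stand for a
+ * caller-provided buffer and its length):
+ *   XXH3_state_t* const st = XXH3_createState();
+ *   XXH3_128bits_reset(st);
+ *   XXH3_128bits_update(st, buf, n);
+ *   { XXH128_hash_t const h = XXH3_128bits_digest(st); (void)h; }
+ *   XXH3_freeState(st);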
+ */
+
+static void
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         state->secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         state->secret + state->secretLimit + STRIPE_LEN
+                                                       - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+
+#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
+#define XXH_IMPLEMENTATION /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h
new file mode 100644
index 0000000..67a5887
--- /dev/null
+++ b/src/xxhash/xxhash.h
@@ -0,0 +1,1965 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: https://www.xxhash.com
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64    13.8 GB/s            1.9 GB/s
+XXH32     6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ * INLINE mode
+ ******************************/
+/*!
+ * XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ *
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+ && !defined(XXH_INLINE_ALL_31684351384)
+ /* this section should be traversed only once */
+# define XXH_INLINE_ALL_31684351384
+ /* give access to the advanced API, required to compile implementations */
+# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
+# define XXH_STATIC_LINKING_ONLY
+ /* make all functions private */
+# undef XXH_PUBLIC_API
+# if defined(__GNUC__)
+# define XXH_PUBLIC_API static __inline __attribute__((unused))
+# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define XXH_PUBLIC_API static inline
+# elif defined(_MSC_VER)
+# define XXH_PUBLIC_API static __inline
+# else
+ /* note: this version may generate warnings for unused static functions */
+# define XXH_PUBLIC_API static
+# endif
+
+ /*
+ * This part deals with the special case where a unit wants to inline xxHash,
+ * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
+ * as part of some previously included *.h header file.
+ * Without further action, the new include would just be ignored,
+ * and functions would effectively _not_ be inlined (silent failure).
+ * The following macros solve this situation by prefixing all inlined names,
+ * avoiding naming collision with previous inclusions.
+ */
+# ifdef XXH_NAMESPACE
+# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
+ /*
+ * Note: Alternative: #undef all symbols (it's a pretty large list).
+ * Without #error: it compiles, but functions are actually not inlined.
+ */
+# endif
+# define XXH_NAMESPACE XXH_INLINE_
+ /*
+ * Some identifiers (enums, type names) are not symbols, but they must
+ * still be renamed to avoid redeclaration.
+ * Alternative solution: do not redeclare them.
+ * However, this requires some #ifdefs, and is a more dispersed action.
+ * Meanwhile, renaming can be achieved in a single block
+ */
+# define XXH_IPREF(Id) XXH_INLINE_ ## Id
+# define XXH_OK XXH_IPREF(XXH_OK)
+# define XXH_ERROR XXH_IPREF(XXH_ERROR)
+# define XXH_errorcode XXH_IPREF(XXH_errorcode)
+# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
+# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
+# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+# define XXH32_state_s XXH_IPREF(XXH32_state_s)
+# define XXH32_state_t XXH_IPREF(XXH32_state_t)
+# define XXH64_state_s XXH_IPREF(XXH64_state_s)
+# define XXH64_state_t XXH_IPREF(XXH64_state_t)
+# define XXH3_state_s XXH_IPREF(XXH3_state_s)
+# define XXH3_state_t XXH_IPREF(XXH3_state_t)
+# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+ /* Ensure the header is parsed again, even if it was previously included */
+# undef XXHASH_H_5627135585666179
+# undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+
+
+/* ****************************************************************
+ * Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+# ifdef XXH_EXPORT
+# define XXH_PUBLIC_API __declspec(dllexport)
+# elif XXH_IMPORT
+# define XXH_PUBLIC_API __declspec(dllimport)
+# endif
+# else
+# define XXH_PUBLIC_API /* do nothing */
+# endif
+#endif
+
+/*!
+ * XXH_NAMESPACE, aka Namespace Emulation:
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
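+ *
+ * For example (hypothetical prefix): building with -DXXH_NAMESPACE=MYLIB_
+ * renames the exported symbol XXH32 to MYLIB_XXH32, while calls to XXH32()
+ * in client code still compile unchanged.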
+ */
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 7
+#define XXH_VERSION_RELEASE 4
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+* 32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint32_t XXH32_hash_t;
+#else
+# include <limits.h>
+# if UINT_MAX == 0xFFFFFFFFUL
+ typedef unsigned int XXH32_hash_t;
+# else
+# if ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
+# else
+# error "unsupported platform: need a 32-bit type"
+# endif
+# endif
+#endif
+
+/*!
+ * XXH32():
+ * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+ * The memory between input & input+length must be valid (allocated and read-accessible).
+ * "seed" can be used to alter the result predictably.
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/******* Streaming *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ */
+
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/******* Canonical representation *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint64_t XXH64_hash_t;
+#else
+ /* the following type must have a width of 64-bit */
+ typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*!
+ * XXH64():
+ * Returns the 64-bit hash of sequence of length @length stored at memory
+ * address @input.
+ * @seed can be used to alter the result predictably.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results. It provides a superior level of
+ * dispersion, and greatly reduces the risks of collisions.
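+ *
+ * A one-shot sketch, with a seed of 0 (`buf` and `n` stand for a
+ * caller-provided buffer and its length):
+ *   XXH64_hash_t const h = XXH64(buf, n, 0);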
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ *
+ * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
+ * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
+ * explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ *
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The 128-bit version adds additional strength, but it is slightly slower.
+ *
+ * The XXH3 algorithm is still in development.
+ * The results it produces may still change in future versions.
+ *
+ * Results produced by v0.7.x are not comparable with results from v0.7.y.
+ * However, the API is completely stable, and it can safely be used for
+ * ephemeral data (local sessions).
+ *
+ * Avoid storing values in long-term storage until the algorithm is finalized.
+ *
+ * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if
+ * everything remains fine, its current format will be "frozen" and become the
+ * final one.
+ *
+ * After which, return values of XXH3 and XXH128 will no longer change in
+ * future versions.
+ *
+ * XXH3's return values will be officially finalized upon reaching v0.8.0.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+
+#ifdef XXH_NAMESPACE
+# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+
+# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+
+# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#endif
+
+/* XXH3_64bits():
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional
+ * collision.
+ * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid trivial sequences, such as repeating sequences and especially '\0',
+ * as these can cancel themselves out.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/*
+ * XXH3_64bits_withSeed():
+ * This variant generates a custom secret on the fly based on the default
+ * secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
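+ *
+ * Sketch, with an arbitrary non-zero seed (`buf` and `n` stand for a
+ * caller-provided buffer and its length):
+ *   XXH64_hash_t const h = XXH3_64bits_withSeed(buf, n, (XXH64_hash_t)1234);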
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+   /* used to store a custom secret generated from the seed. Makes state larger.
+    * Design might change */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+   XXH32_hash_t bufferedSize;
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t secretLimit;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+   /* note: there is some padding after due to alignment on 64 bytes */
+   const unsigned char* secret;
+};   /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever possible.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with the default parameters.
+ * The result will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * The digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, and must outlive the hash streaming session, so
+ * be careful when using stack arrays.
+ * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
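+ *
+ * A minimal sketch of the lifetime requirement (editor's addition; the
+ * buffer name is illustrative and should be filled with random bytes):
+ *
+ *     static unsigned char kSecret[XXH3_SECRET_SIZE_MIN];  (outlives the session)
+ *     XXH3_64bits_reset_withSecret(statePtr, kSecret, sizeof(kSecret));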
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
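+ *
+ * For example (editor's sketch), it can be passed directly to qsort():
+ *
+ *     qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);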
+ *
+ * return: >0 if *h128_1 > *h128_2
+ *         <0 if *h128_1 < *h128_2
+ *         =0 if *h128_1 == *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be found in xxhash.c.
+ *
+ * However, code inlining requires the implementation to be visible to the
+ * compiler, usually within the header.
+ *
+ * As a workaround, xxhash.c used to be included within xxhash.h. This caused
+ * some issues with some build systems, especially ones which treat .c files
+ * as source files.
+ *
+ * Therefore, the implementation is now directly integrated within xxhash.h.
+ * Another small advantage is that xxhash.c is no longer needed in /include.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!
+ * XXH_FORCE_MEMORY_ACCESS:
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selecting a different access method for improved
+ * performance.
+ * Method 0 (default):
+ *     Use `memcpy()`. Safe and portable.
+ * Method 1:
+ *     `__attribute__((packed))` statement. It depends on compiler extensions
+ *     and is therefore not portable.
+ *     This method is safe if your compiler supports it, and *generally* as
+ *     fast or faster than `memcpy`.
+ * Method 2:
+ *     Direct access via cast. This method doesn't depend on the compiler but
+ *     violates the C standard.
+ *     It can generate buggy code on targets which do not support unaligned
+ *     memory accesses.
+ *     But in some circumstances, it's the only known way to get the most
+ *     performance (i.e. GCC + ARMv6)
+ * Method 3:
+ *     Byteshift. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction.
+ *     See https://stackoverflow.com/a/32095106/646947 for details.
+ *     Prefer these methods in priority order (0 > 1 > 2 > 3)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!
+ * XXH_ACCEPT_NULL_INPUT_POINTER:
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+ * triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is NULL, the result is the same as for a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!
+ * XXH_FORCE_ALIGN_CHECK:
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means: check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned or when alignment
+ * doesn't matter for performance.
+ *
+ * This option does not affect XXH3.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+#  if defined(__i386)  || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!
+ * XXH_NO_INLINE_HINTS:
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#ifndef XXH_NO_INLINE_HINTS
+#  if defined(__OPTIMIZE_SIZE__)  /* -Os, -Oz */ \
+   || defined(__NO_INLINE__)      /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+/*!
+ * XXH_REROLL:
+ * Whether to reroll XXH32_finalize and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang.
+ */
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*!
+ * Modify the local functions below should you wish to use some other memory
+ * routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free(void* p)  { free(p); }
+
+/*! and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) \
+    || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
+#    endif
+#  else
+#    define XXH_FORCE_INLINE static
+#    define XXH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*
+ * DEBUGLEVEL is expected to be defined externally, typically via the compiler's
+ * command line options. The value must be a number.
+ */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* ***   Memory access   *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/*!
+ * XXH_CPU_LITTLE_ENDIAN:
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
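+        /* Editor's note: both branches produce the little-endian
+         * interpretation of the bytes; e.g. the byte sequence
+         * 0x01 0x02 0x03 0x04 yields 0x04030201 on either endianness,
+         * the swap only being paid on big-endian hosts. */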
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+     * loop (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even on Sandy/Ivy Bridge,
+     *   where pmulld was fastest, it is still not worth going into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp, v  // not required with VEX encoding
+     *      pslld tmp, 13  // tmp <<= 13
+     *      psrld v, 19    // x >>= 19
+     *      por v, tmp     // x |= tmp
+     *   compared to one for scalar:
+     *      roll v, 13     // reliably fast across the board
+     *      shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   SIMD actually serializes this operation: while v1 is rotating, v2
+     *   can load data and v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     * :                // However, as an Input/Output Operand,
+     * "+r"             // constrain a read/write operand (+) as a general purpose register (r).
+     * (acc)            // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize.
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
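+/* Editor's sketch of the streaming API implemented here, assuming input
+ * arriving in two chunks (`part1`/`part2` are illustrative names):
+ *
+ *     XXH32_state_t* st = XXH32_createState();
+ *     XXH32_reset(st, 0);                    // seed 0
+ *     XXH32_update(st, part1, part1Len);
+ *     XXH32_update(st, part2, part2Len);
+ *     XXH32_hash_t h = XXH32_digest(st);     // equals one-shot XXH32()
+ *     XXH32_freeState(st);
+ */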
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
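+        /* Editor's note: the XXH_aligned path is only taken after the
+         * caller has ensured alignment (e.g. the `& 7` test in XXH64()
+         * below), so the direct dereference is safe here. */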
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
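+        /* Editor's note: for a power-of-two divisor, `addr & 7` equals
+         * `addr % 8`, so this selects the aligned fast path; e.g.
+         * 0x1010 & 7 == 0 (aligned), 0x1013 & 7 == 3 (unaligned). */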
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
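+/* Editor's sketch of a canonical round trip (the FILE* `f` is illustrative):
+ *
+ *     XXH64_canonical_t c;
+ *     XXH64_canonicalFromHash(&c, XXH64(buf, len, 0));
+ *     fwrite(c.digest, 1, sizeof(c.digest), f);        // stored big endian
+ *     // ... later, after reading the 8 bytes back into c:
+ *     XXH64_hash_t h = XXH64_hashFromCanonical(&c);    // recovers the value
+ */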
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From e335a8ca7615c702cfa2dcdb71deb69468088fd8 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jun 2020 21:04:25 +0200 Subject: first steps in bringing over the JIT refactor/fastmem --- src/ARM.cpp | 43 +- src/ARM.h | 15 +- src/ARMJIT.cpp | 771 ++++++++++----------------------- src/ARMJIT.h | 64 +-- src/ARMJIT_A64/ARMJIT_ALU.cpp | 123 +++++- src/ARMJIT_A64/ARMJIT_Branch.cpp | 99 ++--- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 383 ++++++++++++----- src/ARMJIT_A64/ARMJIT_Compiler.h | 71 +++- src/ARMJIT_A64/ARMJIT_Linkage.s | 68 +++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 790 ++++++++++++++++------------------ src/ARMJIT_Compiler.h | 12 + src/ARMJIT_Internal.h | 70 +-- src/ARMJIT_Memory.cpp | 822 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_Memory.h | 53 +++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 92 +--- src/ARMJIT_x64/ARMJIT_Compiler.h | 11 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 45 +- src/ARM_InstrInfo.cpp | 73 ++-- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 6 +- src/CP15.cpp | 84 ++-- src/Config.cpp | 6 +- src/Config.h | 1 + src/NDS.cpp | 220 +++++----- src/NDS.h | 17 +- 25 files changed, 2342 insertions(+), 1598 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_Memory.cpp create mode 100644 src/ARMJIT_Memory.h (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 92a3a9e..e529be8 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,6 +21,8 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "ARMJIT.h" +#include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" @@ -74,7 +76,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -82,6 +86,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -622,24 +633,26 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); - if (!translatedAddr) + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - // hack so Cycles <= 0 becomes Cycles < 0 - Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, 
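+            /* Editor's note: each fast-lookup entry is a u64 that packs
+             * (blockAddr | cpu number) into the upper 32 bits and the
+             * compiled entry's offset into the lower 32, as written by
+             * CompileBlock() later in this patch. */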
FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; if (StopExecution) { @@ -766,23 +779,25 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); - if (!translatedAddr) + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index b1e8053..b7f16d6 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -32,11 +32,14 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); @@ -143,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -158,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -260,8 +269,8 @@ public: u32 DTCMBase, DTCMSize; s32 RegionCodeCycles; - u8 ITCM[0x8000]; - u8 DTCM[0x4000]; + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8d87c76..53b28c1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -10,13 +10,8 @@ #include "Config.h" #include "ARMJIT_Internal.h" -#if defined(__x86_64__) -#include "ARMJIT_x64/ARMJIT_Compiler.h" -#elif defined(__aarch64__) -#include "ARMJIT_A64/ARMJIT_Compiler.h" -#else -#error "The current target platform doesn't have a JIT backend" -#endif +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -29,6 +24,11 @@ #include "Wifi.h" #include "NDSCart.h" +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + namespace ARMJIT { @@ -37,281 +37,100 @@ namespace ARMJIT Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = -{ - 0x8000, // Unmapped Region (dummy) - 0x8000, // ITCM - 4*1024*1024, // Main RAM - 0x8000, // SWRAM - 0xA4000, // LCDC - 0x8000, // ARM9 BIOS - 0x4000, // ARM7 BIOS - 0x10000, // ARM7 WRAM - 0x40000 // ARM7 WVRAM -}; - -const u32 ExeMemRegionOffsets[] = -{ - 0, - 0x8000, - 0x10000, - 0x410000, - 0x418000, - 0x4BC000, - 0x4C4000, - 0x4C8000, - 0x4D8000, - 0x518000, -}; - -/* - translates address to pseudo physical address - - more 
compact, eliminates mirroring, everything comes in a row - - we only need one translation table -*/ - -u32 TranslateAddr9(u32 addr) -{ - switch (ClassifyAddress9(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM9: - if (NDS::SWRAM_ARM9) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); - else - return 0; - case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); - case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; - case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); - default: return 0; - } -} - -u32 TranslateAddr7(u32 addr) -{ - switch (ClassifyAddress7(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM7: - if (NDS::SWRAM_ARM7) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); - else - return 0; - case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; - case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); - case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); - default: return 0; - } -} - -AddressRange CodeRanges[ExeMemSpaceSize / 512]; - -TinyVector InvalidLiterals; +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; -u8 MemoryStatus9[0x800000]; -u8 MemoryStatus7[0x800000]; +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; -int ClassifyAddress9(u32 addr) +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { - if (addr < NDS::ARM9->ITCMSize) - return memregion_ITCM; - else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else - { - switch (addr & 0xFF000000) - { - case 0x02000000: - return memregion_MainRAM; - case 0x03000000: - return memregion_SWRAM9; - case 0x04000000: - return memregion_IO9; - case 0x06000000: - return memregion_VRAM; - } - } - return memregion_Other; -} + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, +}; -int ClassifyAddress7(u32 addr) +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = { - if (addr < 0x00004000) - return memregion_BIOS7; - else - { - switch (addr & 0xFF800000) - { - case 0x02000000: - case 0x02800000: - return memregion_MainRAM; - case 0x03000000: - if 
(NDS::SWRAM_ARM7) - return memregion_SWRAM7; - else - return memregion_WRAM7; - case 0x03800000: - return memregion_WRAM7; - case 0x04000000: - return memregion_IO7; - case 0x04800000: - return memregion_Wifi; - case 0x06000000: - case 0x06800000: - return memregion_VWRAM; - } - } - return memregion_Other; -} + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, +}; -void UpdateMemoryStatus9(u32 start, u32 end) +u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) - { - u32 addr = i << 12; - - int region = ClassifyAddress9(addr); - u32 pseudoPhyisical = TranslateAddr9(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus9[i * 8 + j] = val; - } - } -} + NULL, + FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM +}; -void UpdateMemoryStatus7(u32 start, u32 end) +u32 LocaliseCodeAddress(u32 num, u32 addr) { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize) + && CodeMemRegions[region]) { - u32 addr = i << 12; - - int region = ClassifyAddress7(addr); - u32 pseudoPhyisical = TranslateAddr7(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus7[i * 8 + j] = val; - } + addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + addr |= (u32)region << 28; + return addr; } + return 0; } -void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) -{ - for (u32 i = 1; i < exeMem_Count; i++) - { - if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) - { - for (u32 num = 0; num < 2; num++) - { - u32 physSize = ExeMemRegionSizes[i]; - u32 mapSize = 0; - u32 mapStart = 0; - switch (i) - { - case exeMem_ITCM: - if (num == 0) - mapStart = 0; mapSize = NDS::ARM9->ITCMSize; - break; - case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; - case exeMem_SWRAM: - if (num == 0) - { - if (NDS::SWRAM_ARM9) - mapStart = 0x3000000, mapSize = 0x1000000; - else - mapStart = mapSize = 0; - } - else - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3000000, mapSize = 0x800000; - else - mapStart = mapSize = 0; - } - break; - case exeMem_LCDC: - if (num == 0) - mapStart = 0x6800000, mapSize = 0xA4000; - break; - case exeMem_ARM9_BIOS: - if (num == 0) - mapStart = 0xFFFF0000, mapSize = 0x10000; - break; - case exeMem_ARM7_BIOS: - if (num == 1) - mapStart = 0; mapSize = 0x4000; - break; - case exeMem_ARM7_WRAM: - if (num == 1) - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3800000, mapSize = 0x800000; - else - mapStart = 0x3000000, mapSize = 0x1000000; - } - break; - case exeMem_ARM7_WVRAM: - if (num == 1) - mapStart = 0x6000000, mapSize = 0x1000000; - break; - } - - for (u32 j = 0; j < mapSize / physSize; 
j++) - { - u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); - if (num == 0 - && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - continue; - if (invalidate) - { - if (num == 0) - MemoryStatus9[virtAddr / 512] |= 0x80; - else - MemoryStatus7[virtAddr / 512] |= 0x80; - } - else - { - if (num == 0) - MemoryStatus9[virtAddr / 512] &= ~0x80; - else - MemoryStatus7[virtAddr / 512] &= ~0x80; - } - } - - } - return; - } - } - - assert(false); -} +TinyVector InvalidLiterals; template -T SlowRead9(ARMv5* cpu, u32 addr) +T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; addr &= ~(sizeof(T) - 1); @@ -335,13 +154,13 @@ T SlowRead9(ARMv5* cpu, u32 addr) } template -void SlowWrite9(ARMv5* cpu, u32 addr, T val) +void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); if (addr < cpu->ITCMSize) { - InvalidateITCMIfNecessary(addr); + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); *(T*)&cpu->ITCM[addr & 0x7FFF] = val; } else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) @@ -362,13 +181,13 @@ void SlowWrite9(ARMv5* cpu, u32 addr, T val) } } -template void SlowWrite9(ARMv5*, u32, u32); -template void SlowWrite9(ARMv5*, u32, u16); -template void SlowWrite9(ARMv5*, u32, u8); +template void SlowWrite9(u32, ARMv5*, u32); +template void SlowWrite9(u32, ARMv5*, u16); +template void SlowWrite9(u32, ARMv5*, u8); -template u32 SlowRead9(ARMv5*, u32); -template u16 SlowRead9(ARMv5*, u32); -template u8 SlowRead9(ARMv5*, u32); +template u32 SlowRead9(u32, ARMv5*); +template u16 SlowRead9(u32, ARMv5*); +template u8 SlowRead9(u32, ARMv5*); template T SlowRead7(u32 addr) @@ -407,14 +226,15 @@ template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) - SlowWrite9(cpu, addr, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(cpu, addr); - addr += !PreInc * 4; + data[i] = SlowRead9(addr, cpu); + addr += 4; } } @@ -422,14 +242,15 @@ template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) SlowWrite7(addr, data[i]); else data[i] = SlowRead7(addr); - addr += !PreInc * 4; + addr += 4; } } @@ -540,16 +361,18 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp9; -UnreliableHashTable FastBlockLookUp7; void Init() { JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); } void DeInit() { + ARMJIT_Memory::DeInit(); + delete JITCompiler; } @@ -557,8 +380,7 @@ void Reset() { ResetBlockCache(); - UpdateMemoryStatus9(0, 0xFFFFFFFF); - UpdateMemoryStatus7(0, 0xFFFFFFFF); + ARMJIT_Memory::Reset(); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -673,11 +495,12 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) // it basically checks if one iteration of a loop depends on another // the rules are quite simple + JIT_DEBUGPRINT("checking potential idle loop\n"); u16 regsWrittenTo = 0; u16 regsDisallowedToWrite = 0; for (int i = 0; i < instrsCount; i++) { - //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); if 
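
// The SlowBlockTransfer rewrite above trades the per-iteration
// "addr += PreInc * 4; ...; addr += !PreInc * 4;" dance for a single
// conditional bump before the loop plus an unconditional step inside it.
// A quick equivalence check (sketch, not part of the patch):
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

int main()
{
    for (int pre = 0; pre <= 1; pre++)
    {
        u32 a = 0x02000000, b = 0x02000000;
        if (pre) b += 4;        // new form: one bump up front
        for (int i = 0; i < 4; i++)
        {
            a += pre * 4;       // old form: bump before the access...
            assert(a == b);     // ...both forms access the same address
            a += !pre * 4;      // ...or bump after it
            b += 4;             // new form: unconditional step
        }
    }
}
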
(instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) return false; if (i < instrsCount - 1 && instrs[i].Info.Branches()) @@ -782,8 +605,6 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F - -extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -794,14 +615,28 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr9(blockAddr) - : TranslateAddr7(blockAddr); - if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) - { - printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); - } - + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 localAddr = existingBlockIt->second->StartAddrLocal; + + u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; @@ -842,9 +677,8 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(instrs[i].Addr) - : TranslateAddr7(instrs[i].Addr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -928,9 +762,11 @@ void CompileBlock(ARM* cpu) && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral && DecodeLiteral(thumb, instrs[i], literalAddr)) { - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(literalAddr) - : TranslateAddr7(literalAddr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } u32 translatedAddrRounded = translatedAddr & ~0x1FF; u32 j = 0; @@ -994,9 +830,7 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetPseudoPhysical = cpu->Num == 0 - ? 
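
// Sketch of the 64-bit fast-map entry written above (my reading of the code,
// not an excerpt): the high word tags the entry with the block's start
// address OR'd with the CPU number (bit 0 of a code address is always clear,
// so the tag is unambiguous), and the low word holds the entry point's
// offset inside the JIT code region.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;
typedef uint64_t u64;

u64 MakeEntry(u32 blockAddr, u32 num, u32 entryOffset)
{
    return (((u64)blockAddr | num) << 32) | entryOffset;
}

bool Matches(u64 entry, u32 addr, u32 num)
{
    return (entry >> 32) == ((u64)addr | num);
}

int main()
{
    u64 e = MakeEntry(0x02004F00, 1, 0x1580);
    assert(Matches(e, 0x02004F00, 1));   // same address, ARM7: hit
    assert(!Matches(e, 0x02004F00, 0));  // same address, ARM9: miss
    assert((u32)e == 0x1580);            // low word recovers the offset
}
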
TranslateAddr9(target) - : TranslateAddr7(target); + u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); if (link) { @@ -1048,7 +882,7 @@ void CompileBlock(ARM* cpu) { RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1087,11 +921,12 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numLiterals; j++) block->Literals()[j] = literalLoadAddrs[j]; - block->PseudoPhysicalAddr = pseudoPhysicalAddr; + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); } else { @@ -1104,30 +939,34 @@ void CompileBlock(ARM* cpu) assert(addressRanges[j] == block->AddressRanges()[j]); assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; - CodeRanges[addressRanges[j] / 512].Blocks.Add(block); - UpdateRegionByPseudoPhyiscal(addressRanges[j], true); + AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); } if (cpu->Num == 0) - { - JitBlocks9[pseudoPhysicalAddr] = block; - FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks9[blockAddr] = block; else - { - JitBlocks7[pseudoPhysicalAddr] = block; - FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 localAddr) { - JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + AddressRange* region = CodeMemRegions[localAddr >> 28]; + AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; for (int i = 0; i < range->Blocks.Length;) @@ -1138,7 +977,7 @@ void InvalidateByAddr(u32 pseudoPhysical) u32 mask = 0; for (int j = 0; j < block->NumAddresses; j++) { - if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) { mask = block->AddressMasks()[j]; invalidated = block->AddressMasks()[j] & mask; @@ -1154,15 +993,21 @@ void InvalidateByAddr(u32 pseudoPhysical) } range->Blocks.Remove(i); + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + } + bool literalInvalidation = false; for (int j = 0; j < block->NumLiterals; j++) { u32 addr = block->Literals()[j]; - if (addr == pseudoPhysical) + if (addr == 
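
// The bookkeeping granularities used throughout the registration and
// invalidation code above, spelled out (sketch): code presence is tracked in
// 512-byte AddressRange buckets with one mask bit per 16 bytes, and memory
// protection toggles per 4 KB page, i.e. per group of eight buckets.
#include <cstdio>
#include <cstdint>

typedef uint32_t u32;

int main()
{
    u32 localAddr = 0x40001234;  // region 4, offset 0x1234 (example value)
    u32 offset    = localAddr & 0xFFFFFFF;

    u32 bucket = offset / 512;               // index into the AddressRange array
    u32 bit    = (offset & 0x1FF) / 16;      // which 16-byte slice of the bucket
    u32 page   = (offset & 0xFFFF000) / 512; // first bucket of the 4 KB page

    // offset 0x1234 -> bucket 9, mask bit 3, page starts at bucket 8
    printf("bucket %u, mask bit %u, page starts at bucket %u\n",
           (unsigned)bucket, (unsigned)bit, (unsigned)page);
}
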
localAddr) { - if (InvalidLiterals.Find(pseudoPhysical) != -1) + if (InvalidLiterals.Find(localAddr) != -1) { - InvalidLiterals.Add(pseudoPhysical); + InvalidLiterals.Add(localAddr); JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); } literalInvalidation = true; @@ -1172,35 +1017,30 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 512) != (pseudoPhysical / 512)) + if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 28]; + AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; assert(otherRange != range); + bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); if (otherRange->Blocks.Length == 0) { + if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + otherRange->Code = 0; - UpdateRegionByPseudoPhyiscal(addr, false); } } } - for (int j = 0; j < block->NumLinks(); j++) - JITCompiler->UnlinkBlock(block->Links()[j]); - block->ResetLinks(); - + FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) - { - JitBlocks9.erase(block->PseudoPhysicalAddr); - FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); - } + JitBlocks9.erase(block->StartAddr); else - { - JitBlocks7.erase(block->PseudoPhysicalAddr); - FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); - } + JitBlocks7.erase(block->StartAddr); if (!literalInvalidation) { @@ -1213,24 +1053,66 @@ void InvalidateByAddr(u32 pseudoPhysical) delete block; } } +} - if (range->Blocks.Length == 0) - UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); +template +void CheckAndInvalidate(u32 addr) +{ + // let's hope this gets all properly inlined + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + { + u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr | (region << 28)); + } +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; } -void InvalidateRegionIfNecessary(u32 pseudoPhyisical) +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { - if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) - InvalidateByAddr(pseudoPhyisical); + int region = num == 0 + ? 
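
// The CheckAndInvalidate definition above is templated on the CPU number and
// the memory region (compare the explicit instantiations further down, e.g.
// CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>), so that
// GetRegionMapping can fold to constants per instantiation. A minimal model
// of the fast path, with the bookkeeping reduced to what the check needs:
#include <cstdint>

typedef uint32_t u32;

struct AddressRange { u32 Code; };  // reduced stand-in for the real struct

template <u32 num, int region>
void CheckAndInvalidateModel(AddressRange* buckets, u32 localAddr)
{
    // only take the expensive invalidation path when the 16-byte slice
    // being written actually holds compiled code
    u32 bit = 1u << ((localAddr & 0x1FF) / 16);
    if (buckets[localAddr / 512].Code & bit)
    {
        // InvalidateByAddr(localAddr | (region << 28));
    }
}

int main()
{
    AddressRange buckets[8] = {};
    CheckAndInvalidateModel<0, 4>(buckets, 0x123); // no code there: no-op
}
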
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (CodeMemRegions[region] + && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize)) + { + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + // evil, though it should work for everything except DTCM which is not relevant here + start = blockAddr & ~(memorySize - 1); + size = memorySize; + return true; + } + else + return false; } +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); + void ResetBlockCache() { printf("Resetting JIT block cache...\n"); InvalidLiterals.Clear(); - FastBlockLookUp9.Reset(); - FastBlockLookUp7.Reset(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -1251,8 +1133,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } delete block; } @@ -1262,8 +1145,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } } JitBlocks9.clear(); @@ -1272,191 +1156,4 @@ void ResetBlockCache() JITCompiler->Reset(); } -template -JitBlockEntry LookUpBlockEntry(u32 addr) -{ - auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; - u32 entryOffset = fastMap.LookUp(addr); - if (entryOffset != UINT32_MAX) - return JITCompiler->AddEntryOffset(entryOffset); - - auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; - auto block = slowMap.find(addr); - if (block != slowMap.end()) - { - fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); - return block->second->EntryPoint; - } - return NULL; -} - -template JitBlockEntry LookUpBlockEntry<0>(u32); -template JitBlockEntry LookUpBlockEntry<1>(u32); - -template -void LinkBlock(ARM* cpu, u32 codeOffset) -{ - auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; - u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); - u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); - auto block = blockMap.find(targetPseudoPhys); - if (block == blockMap.end()) - { - CompileBlock(cpu); - block = blockMap.find(targetPseudoPhys); - } - - JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); - - block->second->AddLink(codeOffset); - JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); -} - -template void LinkBlock<0>(ARM*, u32); -template void LinkBlock<1>(ARM*, u32); - -void WifiWrite32(u32 addr, u32 val) -{ - Wifi::Write(addr, val & 0xFFFF); - Wifi::Write(addr + 2, val >> 16); -} - -u32 WifiRead32(u32 addr) -{ - return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); -} - -template -void VRAMWrite(u32 addr, T val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} -template -T VRAMRead(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} - -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) -{ - if (cpu->Num == 0) - { - switch (addr & 0xFF000000) - { - case 0x04000000: - if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) - return (void*)NDSCart::ReadROMData; - - /* - unfortunately we can't map GPU2D this way - since it's hidden inside an object - - though GPU3D registers are accessed much more intensive - */ - if (addr >= 0x04000320 && addr < 0x040006A4) - { - switch (size | store) - { - case 8: return (void*)GPU3D::Read8; - case 9: return (void*)GPU3D::Write8; - case 16: return (void*)GPU3D::Read16; - case 17: return (void*)GPU3D::Write16; - case 32: return (void*)GPU3D::Read32; - case 33: return (void*)GPU3D::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; - } - break; - case 0x06000000: - switch (size | store) - { - case 8: return (void*)VRAMRead; - case 9: return NULL; - case 16: return (void*)VRAMRead; - case 17: return (void*)VRAMWrite; - case 32: return (void*)VRAMRead; - case 33: return (void*)VRAMWrite; - } - break; - } - } - else - { - switch (addr & 0xFF800000) - { - case 0x04000000: - if (addr >= 0x04000400 && addr < 0x04000520) - { - switch (size | store) - { - case 8: return (void*)SPU::Read8; - case 9: return (void*)SPU::Write8; - case 16: return (void*)SPU::Read16; - case 17: return (void*)SPU::Write16; - case 32: return (void*)SPU::Read32; - case 33: return (void*)SPU::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; - } - break; - case 0x04800000: - if (addr < 0x04810000 && size >= 16) - { - switch (size | store) - { - case 16: return (void*)Wifi::Read; - case 17: return (void*)Wifi::Write; - case 32: return 
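
// The GetFuncForAddr dispatch above is keyed on (size | store): the sizes
// 8/16/32 all have bit 0 clear, so OR'ing in the store flag packs both into
// one switch value, with 8/16/32 selecting the read handlers and 9/17/33 the
// matching writes. Sketch of the key's round trip:
#include <cassert>

int main()
{
    const int sizes[] = {8, 16, 32};
    for (int size : sizes)
    {
        for (int store = 0; store <= 1; store++)
        {
            int key = size | store;
            assert((key & ~1) == size);  // size is recoverable from the key
            assert((key & 1) == store);  // and so is the access direction
        }
    }
}
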
(void*)WifiRead32; - case 33: return (void*)WifiWrite32; - } - } - break; - case 0x06000000: - case 0x06800000: - switch (size | store) - { - case 8: return (void*)GPU::ReadVRAM_ARM7; - case 9: return (void*)GPU::WriteVRAM_ARM7; - case 16: return (void*)GPU::ReadVRAM_ARM7; - case 17: return (void*)GPU::WriteVRAM_ARM7; - case 32: return (void*)GPU::ReadVRAM_ARM7; - case 33: return (void*)GPU::WriteVRAM_ARM7; - } - } - } - return NULL; -} - } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 44a6140..2320b7b 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,32 +9,7 @@ namespace ARMJIT { -enum ExeMemKind -{ - exeMem_Unmapped = 0, - exeMem_ITCM, - exeMem_MainRAM, - exeMem_SWRAM, - exeMem_LCDC, - exeMem_ARM9_BIOS, - exeMem_ARM7_BIOS, - exeMem_ARM7_WRAM, - exeMem_ARM7_WVRAM, - exeMem_Count -}; - -extern const u32 ExeMemRegionOffsets[]; -extern const u32 ExeMemRegionSizes[]; - -typedef u32 (*JitBlockEntry)(); - -const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... - -u32 TranslateAddr9(u32 addr); -u32 TranslateAddr7(u32 addr); - -template -JitBlockEntry LookUpBlockEntry(u32 addr); +typedef void (*JitBlockEntry)(); void Init(); void DeInit(); @@ -43,44 +18,15 @@ void Reset(); void InvalidateByAddr(u32 pseudoPhysical); -void InvalidateRegionIfNecessary(u32 addr); - -inline void InvalidateMainRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); -} -inline void InvalidateITCMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); -} -inline void InvalidateLCDCIfNecessary(u32 addr) -{ - if (addr < 0x68A3FFF) - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); -} -inline void InvalidateSWRAM7IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); -} -inline void InvalidateSWRAM9IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); -} -inline void InvalidateARM7WRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); -} -inline void InvalidateARM7WVRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); -} +template +void CheckAndInvalidate(u32 addr); void CompileBlock(ARM* cpu); void ResetBlockCache(); -void UpdateMemoryStatus9(u32 start, u32 end); -void UpdateMemoryStatus7(u32 start, u32 end); +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size); } diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp index 0fe6a97..5f021a0 100644 --- a/src/ARMJIT_A64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -243,7 +243,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S && !CurInstr.SetFlags) S = false; - bool CVInGP = false; + bool CVInGPR = false; switch (op) { case 0x2: // SUB @@ -306,7 +306,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 UBFX(W2, RCPSR, 29, 1); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, rn, W2); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -335,7 +335,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 ORN(W1, WZR, op2.Reg.Rm, 
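
// Plain-C++ model of the CVInGPR path renamed above (a sketch, not an
// excerpt): when a multi-instruction lowering such as ADC clobbers the host
// flags before the end, carry and overflow are captured into GPRs with CSET
// right after the ADDS, then inserted into CPSR bits 29 (C) and 28 (V) by
// the BFI pair at the end.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;
typedef uint64_t u64;

u32 UpdateCV(u32 cpsr, bool c, bool v)
{
    cpsr &= ~((1u << 29) | (1u << 28));
    cpsr |= (u32)c << 29;   // BFI(RCPSR, W2, 29, 1)
    cpsr |= (u32)v << 28;   // BFI(RCPSR, W3, 28, 1)
    return cpsr;
}

int main()
{
    // ADC with a carry out but no signed overflow: 0xFFFFFFFF + 0 + 1
    u32 a = 0xFFFFFFFF, b = 0, carryIn = 1;
    u32 res = a + b + carryIn;
    bool c = (((u64)a + b + carryIn) >> 32) != 0;    // CSET(W2, CC_CS)
    bool v = (((~(a ^ b)) & (a ^ res)) >> 31) != 0;  // CSET(W3, CC_VS)
    assert(UpdateCV(0, c, v) == (1u << 29));
}
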
op2.ToArithOption()); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -355,7 +355,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 MVN(W1, rn); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -379,12 +379,12 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S) { - if (CVInGP) + if (CVInGPR) { BFI(RCPSR, W2, 29, 1); BFI(RCPSR, W3, 28, 1); } - Comp_RetriveFlags(!CVInGP); + Comp_RetriveFlags(!CVInGPR); } } @@ -501,7 +501,23 @@ void Compiler::A_Comp_ALUMovOp() MOVI2R(rd, op2.Imm); } else - MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } } if (S) @@ -558,10 +574,7 @@ void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + CLS(W0, rs); Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -594,10 +607,10 @@ void Compiler::A_Comp_Mul_Long() } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -628,6 +641,86 @@ void Compiler::A_Comp_Mul_Long() Comp_RetriveFlags(false); } +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? 
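
// Reference model for the SMULWy/SMLAWy lowering here (SMULL then ASR #16):
// the architectural result is the signed 32x16 product shifted right 16
// bits; the SMLAWy form (!x) then accumulates rn and sets the Q bit (CPSR
// bit 27) on signed overflow, which is what the ADDS/CSEL pair implements.
// Sketch of the multiply itself:
#include <cassert>
#include <cstdint>

typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
typedef uint32_t u32;

u32 SMULWy(u32 rm, u32 rs, bool y)
{
    s16 half = (s16)(y ? (rs >> 16) : (rs & 0xFFFF)); // the SBFX half select
    s64 prod = (s64)(s32)rm * half;                   // SMULL
    return (u32)(prod >> 16);                         // ASR #16
}

int main()
{
    assert(SMULWy(0x00010000, 0x0002, false) == 2);    // low half of rs
    assert(SMULWy(0x00010000, 0x00030000, true) == 3); // y picks the high half
}
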
EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + void Compiler::A_Comp_Mul() { ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index 542f0b7..f130938 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -143,7 +143,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } @@ -152,23 +152,19 @@ void* Compiler::Gen_JumpTo9(int kind) AlignCode16(); void* res = GetRXPtr(); - MOVI2R(W2, kCodeCacheTiming); - // W1 - code cycles non branch - // W2 - branch code cycles LSR(W1, W0, 12); - LSL(W1, W1, 2); ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); LDRB(W1, RCPU, W1); - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, ITCMSize)); STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); - CMP(W0, W3); - FixupBranch outsideITCM = B(CC_LO); - MOVI2R(W1, 1); - MOVI2R(W2, 1); - SetJumpTarget(outsideITCM); + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); FixupBranch switchToThumb; if (kind == 0) @@ -176,40 +172,36 @@ void* Compiler::Gen_JumpTo9(int kind) if (kind == 0 || kind == 1) { - ANDI2R(W0, W0, ~3); - + // ARM if (kind == 0) ANDI2R(RCPSR, RCPSR, ~0x20); - ADD(W3, W0, 4); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); RET(); } + if (kind == 0 || kind == 2) { + // Thumb if (kind == 0) { SetJumpTarget(switchToThumb); - ORRI2R(RCPSR, RCPSR, 0x20); } ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); - ADD(W3, W0, 2); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - FixupBranch halfwordLoc = TBZ(W0, 1); - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); - RET(); - - SetJumpTarget(halfwordLoc); - ADD(RCycles, RCycles, W2); + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); RET(); } @@ -237,7 +229,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 0, 8); UBFX(W3, W3, 8, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~3); @@ -261,7 +253,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 16, 8); UBFX(W3, W3, 24, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~1); @@ -287,22 +279,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto } else { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); - bool previouslyDirty = CPSRDirty; + + bool cpsrDirty = CPSRDirty; SaveCPSR(); - - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
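
// How I read the reworked Gen_JumpTo9 timing sequence above: the per-page
// timing byte is looked up at MemTimings[addr >> 12], the sentinel 0xFF is
// swapped for kCodeCacheTiming by the CMP/CSEL, and the CMP/CSINC against
// WZR forces a flat 1-cycle fetch inside ITCM. The table layout and the
// kCodeCacheTiming value here are assumptions for illustration.
#include <cassert>
#include <cstdint>

typedef uint8_t u8;
typedef uint32_t u32;

const u32 kCodeCacheTiming = 3; // illustrative value

u32 CodeCycles(const u8* memTimings, u32 addr, u32 itcmSize)
{
    u32 t = memTimings[addr >> 12];       // LDRB from MemTimings
    if (t == 0xFF) t = kCodeCacheTiming;  // CMP W1, 0xFF / CSEL
    if (addr < itcmSize) t = 1;           // CMP W0, W2 / CSINC(W1, W1, WZR)
    return t;
}

int main()
{
    u8 timings[16] = {};
    timings[9] = 4;
    timings[10] = 0xFF;
    assert(CodeCycles(timings, 0x0000, 0x8000) == 1);  // ITCM wins
    assert(CodeCycles(timings, 0x9000, 0x8000) == 4);  // plain timing byte
    assert(CodeCycles(timings, 0xA000, 0x8000) == 3);  // cached code region
}
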
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + SaveCycles(); + PushRegs(restoreCPSR); if (switchThumb) MOV(W1, addr); @@ -319,16 +300,12 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto QuickCallFunction(X3, jumpToTrampoline); else QuickCallFunction(X3, jumpToTrampoline); - - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + PopRegs(restoreCPSR); + LoadCycles(); + LoadCPSR(); + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } } @@ -368,21 +345,13 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); FixupBranch skipFailed = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipFailed); } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index a67f357..42435ed 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -1,9 +1,3 @@ -#include "ARMJIT_Compiler.h" - -#include "../ARMInterpreter.h" - -#include "../ARMJIT_Internal.h" - #ifdef __SWITCH__ #include "../switch/compat_switch.h" @@ -13,10 +7,17 @@ extern char __start__; #include #endif +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + #include using namespace Arm64Gen; +extern "C" void ARM_Ret(); namespace ARMJIT { @@ -28,7 +29,10 @@ namespace ARMJIT like x64. At one hand you can translate a lot of instructions directly. But at the same time, there are a ton of exceptions, like for example ADD and SUB can't have a RORed second operand on ARMv8. - */ + + While writing a JIT when an instruction is recompiled into multiple ones + not to write back until you've read all the other operands! +*/ template <> const ARM64Reg RegisterCache::NativeRegAllocOrder[] = @@ -46,6 +50,132 @@ void Compiler::MovePC() ADD(MapReg(15), MapReg(15), Thumb ? 
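
// Background for the "no RORed second operand" remark above: AArch64's
// shifted-register ADD/SUB only encode LSL/LSR/ASR, so an ARM operand like
// "rm, ROR #n" must be rotated into a scratch register first (exactly what
// the ST_ROR special case in Comp_MemAccess does further down). Sketch:
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

u32 Ror32(u32 x, unsigned n)
{
    n &= 31;
    return n ? (x >> n) | (x << (32 - n)) : x;
}

int main()
{
    // ADD rd, rn, rm, ROR #8  ==>  ROR w0, rm, #8 ; ADD rd, rn, w0
    u32 rn = 100, rm = 0x000000FF;
    u32 scratch = Ror32(rm, 8);               // 0xFF000000
    assert(rn + scratch == 100 + 0xFF000000);
}
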
2 : 4); } +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); + + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + Compiler::Compiler() { #ifdef __SWITCH__ @@ -80,8 +210,7 @@ Compiler::Compiler() assert(succeded); SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); - JitMemUseableSize = JitMemSize; - Reset(); + JitMemMainSize = JitMemSize; #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); @@ -90,31 +219,8 @@ Compiler::Compiler() SetCodeBase(pageAligned, pageAligned); JitMemUseableSize = alignedSize; - Reset(); #endif - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - { - MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); - } - } - MemFunc7[0][0] = (void*)NDS::ARM7Read8; - MemFunc7[1][0] = (void*)NDS::ARM7Read16; - MemFunc7[2][0] = (void*)NDS::ARM7Read32; - MemFunc7[0][1] = (void*)NDS::ARM7Write8; - MemFunc7[1][1] = (void*)NDS::ARM7Write16; - MemFunc7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 2; j++) - { - MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); - MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); - } - } + SetCodePtr(0); for (int i = 0; i < 3; i++) { @@ -123,26 +229,26 @@ Compiler::Compiler() } /* - W0 - mode + W5 - mode W1 - reg num W3 - in/out value of reg */ { ReadBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, 
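
// Model of the MSR mask narrowing emitted above (a sketch; the real code
// also handles banked SPSRs and the mode-switch call): in user mode (0x10)
// the control byte is privileged, so the requested field mask is intersected
// with 0xFFFFFF00 by the CMP/CSEL pair before the write.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

u32 MsrWrite(u32 psr, u32 val, u32 fieldMask, bool userMode)
{
    u32 mask = userMode ? (fieldMask & 0xFFFFFF00) : fieldMask;
    return (psr & ~mask) | (val & mask);
}

int main()
{
    u32 cpsr = 0x00000010;  // user mode
    // one MSR trying to set the flags and switch mode at the same time
    u32 out = MsrWrite(cpsr, 0xF000001F, 0xFF0000FF, true);
    assert((out >> 28) == 0xF);    // the flag writes take effect
    assert((out & 0x1F) == 0x10);  // the mode bits are silently ignored
}
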
ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); RET(); @@ -166,19 +272,19 @@ Compiler::Compiler() { WriteBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); MOVI2R(W4, 0); @@ -206,9 +312,71 @@ Compiler::Compiler() RET(); } - //FlushIcache(); + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 33: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 17: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + case 9: QuickCallFunction(X3, SlowWrite7); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 33: QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead9); break; + case 17: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead9); break; + case 9: QuickCallFunction(X3, SlowRead7); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; - JitMemUseableSize -= GetCodeOffset(); SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); } @@ -227,6 +395,16 @@ Compiler::~Compiler() #endif } +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + void Compiler::LoadReg(int reg, ARM64Reg nativeReg) { if (reg == 15) @@ -325,7 +503,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), // Mul - F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, 
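
// The PatchedStoreFuncs/PatchedLoadFuncs switches a little above read
// identically per case because the size-specialising template arguments did
// not survive into this copy of the patch; going by the SlowWrite9/SlowWrite7
// definitions and explicit instantiations earlier in the diff, the intended
// dispatch is almost certainly (my reconstruction):
//
//     case 32: QuickCallFunction(X3, SlowWrite9<u32>); break;
//     case 33: QuickCallFunction(X3, SlowWrite7<u32>); break;
//     case 16: QuickCallFunction(X3, SlowWrite9<u16>); break;
//     case 17: QuickCallFunction(X3, SlowWrite7<u16>); break;
//     case 8:  QuickCallFunction(X3, SlowWrite9<u8>);  break;
//     case 9:  QuickCallFunction(X3, SlowWrite7<u8>);  break;
//
// and likewise SlowRead9<u32/u16/u8> / SlowRead7<u32/u16/u8> in the load
// variant; the bare "template" headers on those helpers would carry the
// matching "template <typename T>" parameter lists.
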
NULL, NULL, NULL, NULL, + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), // ARMv5 exclusives F(Clz), NULL, NULL, NULL, NULL, @@ -356,7 +534,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL, &Compiler::Nop }; #undef F @@ -404,29 +582,34 @@ bool Compiler::CanCompile(bool thumb, u16 kind) return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; } -void Compiler::Comp_BranchSpecialBehaviour() +void Compiler::Comp_BranchSpecialBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) + if (taken && CurInstr.BranchFlags & branch_IdleBranch) { MOVI2R(W0, 1); STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); } - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { - SaveCPSR(false); RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); } } JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) { - printf("JIT memory full, resetting...\n"); + printf("JIT far memory full, resetting...\n"); ResetBlockCache(); } @@ -437,21 +620,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] CurCPU = cpu; ConstantCycles = 0; RegCache = RegisterCache(this, instrs, instrsCount, true); - - //printf("compiling block at %x\n", R15 - (Thumb ? 
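
// Sketch of the near/far code-memory split behind the two "full" checks
// above (my reading of JitMemMainSize/JitMemSecondarySize and the
// SwapCodeRegion helper defined in the header diff further down): block
// bodies are emitted into the main region while OtherCodeRegion remembers
// the cursor of the region not in use, and swapping just exchanges the two.
#include <cassert>

struct Emitter
{
    long codePtr = 0;        // current emission offset (GetCodeOffset())
    long otherCodeRegion;    // saved cursor of the region not in use

    void SwapCodeRegion()
    {
        long tmp = codePtr;
        codePtr = otherCodeRegion;
        otherCodeRegion = tmp;
    }
};

int main()
{
    Emitter e;
    e.otherCodeRegion = 1024;  // far region begins at JitMemMainSize (example)
    e.codePtr = 100;           // somewhere inside the main region

    e.SwapCodeRegion();        // emit out-of-line code in the far region
    assert(e.codePtr == 1024 && e.otherCodeRegion == 100);

    e.SwapCodeRegion();        // back to the interrupted block body
    assert(e.codePtr == 100 && e.otherCodeRegion == 1024);
}
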
2 : 4)); - const u32 ALL_CALLEE_SAVED = 0x7FF80000; - - SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); - - //if (Num == 1) - { - ABI_PushRegisters(SavedRegs); - - MOVP2R(RCPU, CurCPU); - MOVI2R(RCycles, 0); - - LoadCPSR(); - } + CPSRDirty = false; for (int i = 0; i < instrsCount; i++) { @@ -486,6 +655,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (comp == NULL) { + SaveCycles(); SaveCPSR(); RegCache.Flush(); } @@ -535,25 +705,18 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] (this->*comp)(); } - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); if (cond < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipNop = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipNop); } @@ -565,76 +728,74 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } if (comp == NULL) + { + LoadCycles(); LoadCPSR(); + } } RegCache.Flush(); - //if (Num == 1) - { - SaveCPSR(); - - ADD(W0, RCycles, ConstantCycles); - - ABI_PopRegisters(SavedRegs); - } - //else - // ADD(RCycles, RCycles, ConstantCycles); - - RET(); + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); FlushIcache(); - //printf("finished\n"); - return res; } void Compiler::Reset() { + LoadStorePatches.clear(); + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; const u32 brk_0 = 0xD4200000; - for (int i = 0; i < JitMemUseableSize / 4; i++) + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) *(((u32*)GetRWPtr()) + i) = brk_0; } -void Compiler::Comp_AddCycles_C(bool nonConst) +void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (!nonConst && !CurInstr.Info.Branches()) + if (forceNonConstant) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 numI) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; - if (Thumb || CurInstr.Cond() >= 0xE) + if (Thumb || CurInstr.Cond() == 0xE) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
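
// Why Reset() fills the buffer with 0xD4200000: that is AArch64 BRK #0
// (encoding 0xD4200000 | imm16 << 5), so a stray jump into stale JIT memory
// traps immediately instead of running leftover code. Sketch:
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

u32 EncodeBrk(uint16_t imm16)
{
    return 0xD4200000u | ((u32)imm16 << 5);
}

int main()
{
    assert(EncodeBrk(0) == 0xD4200000u);  // the brk_0 fill value above
}
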
0 : CurInstr.CodeCycles)) + c; - ADD(RCycles, RCycles, numI, shift); + SUB(RCycles, RCycles, cycles); if (Thumb || CurInstr.Cond() >= 0xE) - ConstantCycles += c; + ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CDI() @@ -671,7 +832,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } @@ -715,7 +876,7 @@ void Compiler::Comp_AddCycles_CD() } if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 5c9ef41..e4ffc63 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -9,6 +9,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include + namespace ARMJIT { @@ -64,7 +66,14 @@ struct Op2 }; }; -class Compiler : Arm64Gen::ARM64XEmitter +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter { public: typedef void (Compiler::*CompileFunc)(); @@ -72,6 +81,9 @@ public: Compiler(); ~Compiler(); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Arm64Gen::ARM64Reg MapReg(int reg) { assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); @@ -89,7 +101,7 @@ public: void Reset(); - void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 numI); void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); void Comp_AddCycles_CD(); @@ -103,6 +115,9 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void LoadCycles(); + void SaveCycles(); + void Nop() {} void A_Comp_ALUTriOp(); @@ -111,6 +126,7 @@ public: void A_Comp_Mul(); void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); void A_Comp_Clz(); @@ -122,6 +138,8 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -168,7 +186,7 @@ public: void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); - void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); enum { memop_Writeback = 1 << 0, @@ -179,16 +197,33 @@ public: }; void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); - void* Gen_MemoryRoutine9(int size, bool store); - - void* Gen_MemoryRoutine9Seq(bool store, bool preinc); - void* Gen_MemoryRoutine7Seq(bool store, bool preinc); - // 0 = switch mode, 1 = stay arm, 2 = stay thumb void* Gen_JumpTo9(int kind); void* Gen_JumpTo7(int kind); - void Comp_BranchSpecialBehaviour(); + void Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; bool Exit; @@ -202,22 +237,20 @@ public: BitSet32 
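
// The cycle helpers above show the counter now running downwards: fixed
// per-instruction costs accumulate into ConstantCycles at compile time and
// are subtracted from RCycles once at block exit (the SUB before the
// QuickTailCall to ARM_Ret), while only conditional or data-dependent costs
// get an inline SUB. A toy version with made-up costs:
#include <cassert>
#include <cstdint>

typedef int32_t s32;

int main()
{
    const s32 instrCost[] = {1, 1, 3, 1};  // unconditional code-fetch costs
    s32 constantCycles = 0;
    for (s32 c : instrCost)
        constantCycles += c;      // folded while compiling the block

    s32 rcycles = 100;
    rcycles -= 2;                 // inline SUB for one irregular access
    rcycles -= constantCycles;    // single SUB emitted before ARM_Ret

    assert(rcycles == 100 - 2 - 6);
}
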
SavedRegs; - u32 JitMemUseableSize; + u32 JitMemSecondarySize; + u32 JitMemMainSize; void* ReadBanked, *WriteBanked; - // [size][store] - void* MemFunc9[3][2]; - void* MemFunc7[3][2]; - - // [store][pre increment] - void* MemFuncsSeq9[2][2]; - // "[code in main ram] - void* MemFuncsSeq7[2][2]; - void* JumpToFuncs9[3]; void* JumpToFuncs7[3]; + std::unordered_map LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + RegisterCache RegCache; bool CPSRDirty = false; diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18 \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6cf710b..b307d0e 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -2,286 +2,62 @@ #include "../Config.h" +#include "../ARMJIT_Memory.h" + using namespace Arm64Gen; namespace ARMJIT { -// W0 - address -// (if store) W1 - value to store -// W2 - code cycles -void* Compiler::Gen_MemoryRoutine9(int size, bool store) +bool Compiler::IsJITFault(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - u32 addressMask; - switch (size) - { - case 32: addressMask = ~3; break; - case 16: addressMask = ~1; break; - case 8: addressMask = ~0; break; - } - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W3, W0, W3); - CMP(W3, W4); - FixupBranch insideDTCM = B(CC_LO); - - UBFX(W4, W0, 24, 8); - CMP(W4, 0x02); - FixupBranch outsideMainRAM = B(CC_NEQ); - ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); - MOVP2R(X4, NDS::MainRAM); - if (!store && size == 32) - { - LDR(W3, X3, X4); - ANDI2R(W0, W0, 3); - LSL(W0, W0, 3); - RORV(W0, W3, W0); - } - else if (store) - STRGeneric(size, W1, X3, X4); - else - LDRGeneric(size, false, W0, X3, X4); - RET(); - - SetJumpTarget(outsideMainRAM); - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W3); - FixupBranch insideITCM = B(CC_LO); - - if (store) - { - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: 
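
// C rendering of the ARM_Dispatch/ARM_Ret contract in ARMJIT_Linkage.s above
// (a sketch; the pinned registers become globals here): the CPU pointer,
// cycle counter and CPSR live in callee-saved X29/W28/W27 for the whole JIT
// excursion, and every block ends by tail-calling ARM_Ret, which writes the
// two cached values back.
#include <cstdint>

typedef uint32_t u32;
typedef int32_t s32;

struct ARM { s32 Cycles; u32 CPSR; };
typedef void (*JitBlockEntry)();

ARM* rcpu; s32 rcycles; u32 rcpsr;  // stand-ins for RCPU/RCycles/RCPSR

void ARM_Ret_Model()
{
    rcpu->Cycles = rcycles;         // str RCycles, [RCPU, ARM_Cycles_offset]
    rcpu->CPSR = rcpsr;             // str RCPSR, [RCPU, ARM_CPSR_offset]
}

void ARM_Dispatch_Model(ARM* cpu, JitBlockEntry block)
{
    rcpu = cpu;                     // mov RCPU, x0
    rcycles = cpu->Cycles;          // ldr RCycles, [RCPU, ...]
    rcpsr = cpu->CPSR;              // ldr RCPSR, [RCPU, ...]
    block();                        // br x1
}

int main()
{
    ARM cpu = {64, 0x10};
    ARM_Dispatch_Model(&cpu, +[]{ rcycles -= 12; ARM_Ret_Model(); });
    return cpu.Cycles == 52 ? 0 : 1;
}
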
QuickTailCall(X4, NDS::ARM9Write32); break; - case 16: QuickTailCall(X4, NDS::ARM9Write16); break; - case 8: QuickTailCall(X4, NDS::ARM9Write8); break; - } - } - else - { - if (size == 32) - ABI_PushRegisters({0, 30}); - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; - case 16: QuickTailCall (X4, NDS::ARM9Read16); break; - case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; - } - if (size == 32) - { - ABI_PopRegisters({1, 30}); - ANDI2R(W1, W1, 3); - LSL(W1, W1, 3); - RORV(W0, W0, W1); - RET(); - } - } - - SetJumpTarget(insideDTCM); - ANDI2R(W3, W3, 0x3FFF & addressMask); - ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - - RET(); - - SetJumpTarget(insideITCM); - ANDI2R(W3, W0, 0x7FFF & addressMask); - if (store) - { - ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4); - LSR(W5, W0, 9); - MOVP2R(X4, CodeRanges); - ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W4); - ABI_PushRegisters({1, 3, 30}); - QuickCallFunction(X4, InvalidateByAddr); - ABI_PopRegisters({1, 3, 30}); - SetJumpTarget(null); - } - ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - RET(); - - return res; + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); } -/* - W0 - base address - X1 - stack space - W2 - values count -*/ -void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +s64 Compiler::RewriteMemAccess(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W4, W0, W4); - CMP(W4, W5); - FixupBranch insideDTCM = B(CC_LO); + auto it = LoadStorePatches.find(pcOffset); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W4); - FixupBranch insideITCM = B(CC_LO); - - ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once - if (store) + if (it != LoadStorePatches.end()) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM9Write32); + LoadStorePatch patch = it->second; - ABI_PopRegisters({0, 1, 2, 30}); - } - else - { - QuickCallFunction(X4, NDS::ARM9Read32); - MOV(W4, W0); + ptrdiff_t curCodeOffset = GetCodeOffset(); - ABI_PopRegisters({0, 1, 2, 30}); + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); - STR(X4, X1, ArithOption(X2, true)); - } + BL(patch.PatchFunc); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); - SetJumpTarget(insideDTCM); + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); - ANDI2R(W4, W4, ~3 & 0x3FFF); - ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } + 
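
// Sketch of the fault-repair flow around IsJITFault/RewriteMemAccess above
// (my summary; the host signal-handler glue is outside this hunk): a fault
// inside the JIT region means an inlined fastmem access hit unmapped guest
// memory, so the LoadStorePatch recorded at compile time is used to replace
// the access with a BL to the slow-path thunk, NOP-pad the patch window,
// flush the icache and resume at pc + PatchOffset.
#include <cstdint>
#include <map>

typedef uint64_t u64;
typedef uint32_t u32;
typedef int32_t s32;

struct LoadStorePatch { void* PatchFunc; s32 PatchOffset; u32 PatchSize; };

std::map<u64, LoadStorePatch> loadStorePatches;  // keyed by code offset

bool HandleJitFault(u64 faultPc, u64 rxBase, u64 rxSize, s32* resumeDelta)
{
    if (faultPc < rxBase || faultPc - rxBase >= rxSize)
        return false;                    // IsJITFault(): not our code

    auto it = loadStorePatches.find(faultPc - rxBase);
    if (it == loadStorePatches.end())
        return false;                    // "this is a JIT bug!"

    // RewriteMemAccess(): BL it->second.PatchFunc at pc + PatchOffset,
    // then PatchSize/4 - 1 NOPs, then FlushIcacheSection(...)
    *resumeDelta = it->second.PatchOffset;
    loadStorePatches.erase(it);          // the site is a slow-path call now
    return true;
}

int main()
{
    loadStorePatches[0x40] = { nullptr, -4, 12 };
    s32 delta = 0;
    bool ok = HandleJitFault(0x10040, 0x10000, 0x1000, &delta);
    return (ok && delta == -4) ? 0 : 1;
}
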
SetCodePtrUnsafe(curCodeOffset); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - - SetJumpTarget(insideITCM); - - ANDI2R(W4, W0, ~3 & 0x7FFF); - - ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X6); - } - else - { - LDR(W5, RCPU, X6); - STR(X5, X1, ArithOption(X2, true)); - } + LoadStorePatches.erase(it); - if (store) - { - ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); - LSR(W6, W4, 9); - MOVP2R(X5, CodeRanges); - ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W5); - ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W4); - QuickCallFunction(X5, InvalidateByAddr); - ABI_PopRegisters({0, 1, 2, 4, 30}); - SetJumpTarget(null); + return patch.PatchOffset; } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; + printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); } -void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - AlignCode16(); - void* res = GetRXPtr(); + u32 localAddr = LocaliseCodeAddress(Num, addr); - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); - - ABI_PushRegisters({0, 1, 2, 30}); - if (store) + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM7Write32); - ABI_PopRegisters({0, 1, 2, 30}); + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - else - { - QuickCallFunction(X4, NDS::ARM7Read32); - MOV(W4, W0); - ABI_PopRegisters({0, 1, 2, 30}); - STR(X4, X1, ArithOption(X2, true)); - } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; -} + Comp_AddCycles_CDI(); -void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -309,6 +85,8 @@ void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); + + return true; } void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) @@ -318,163 +96,209 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) addressMask = ~3; if (size == 16) addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } if (flags & memop_Store) Comp_AddCycles_CD(); else Comp_AddCycles_CDI(); - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) - { - u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? 
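
// What the literal fold above does, as a reference model (a hypothetical
// two-word pool; the encoding is a plain ARM "LDR r0, [PC, #4]"): the word
// is read out of code memory while compiling and the load becomes a MOVI2R,
// unless the address shows up in InvalidLiterals, in which case the fold is
// skipped once and the load stays a real memory access.
#include <cassert>
#include <cstdint>

typedef uint32_t u32;

int main()
{
    u32 code[4] = { 0xE59F0004, 0, 0, 0xDEADBEEF };  // LDR r0, [PC, #4]; pool
    u32 instrAddr = 0x02000000;

    u32 r15 = instrAddr + 8;            // ARM reads PC two instructions ahead
    u32 literalAddr = r15 + 4;          // base + immediate offset
    u32 val = code[(literalAddr - instrAddr) / 4];

    assert(literalAddr == 0x0200000C);
    assert(val == 0xDEADBEEF);          // emitted as MOVI2R(rd, val)
}
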
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); - return; - } + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; } + ARM64Reg finalAddr = W0; + if (flags & memop_Post) { - ARM64Reg rdMapped = MapReg(rd); - ARM64Reg rnMapped = MapReg(rn); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; + finalAddr = rnMapped; + MOV(W0, rnMapped); + } - void* memFunc = Num == 0 - ? MemFunc9[size >> 4][!!(flags & memop_Store)] - : MemFunc7[size >> 4][!!((flags & memop_Store))]; + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might has become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) { - u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) - { - ARMv5* cpu5 = (ARMv5*)CurCPU; + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } - } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); - if (region.Mem != NULL) - { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - MOVP2R(X0, ptr); - if (flags & memop_Store) - STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); - else - { - LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); - if (size == 32 && addr & ~0x3) - ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); - } - return; - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } - } + patch.PatchFunc = flags & memop_Store + ? 
PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); - ARM64Reg finalAddr = W0; - if (flags & memop_Post) - { - finalAddr = rnMapped; - MOV(W0, rnMapped); - } + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); if (flags & memop_Store) - MOV(W1, rdMapped); - - if (!offset.IsImm) - Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); - // offset might become an immediate - if (offset.IsImm) { - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Imm); - else - ADD(finalAddr, rnMapped, offset.Imm); + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); } else { - if (offset.Reg.ShiftType == ST_ROR) + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, X7); + if (size == 32) { - ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); - offset = Op2(W0); + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); } - - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); - else - ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); } - if (!(flags & memop_Post) && (flags & memop_Writeback)) - MOV(rnMapped, W0); + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); - if (inlinePreparation) + if (func) { - if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) - ANDI2R(rdMapped, W0, 3); - if (size > 8) - ANDI2R(W0, W0, addressMask); + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } } - QuickCallFunction(X2, memFunc); - if (!(flags & memop_Store)) + else { - if (inlinePreparation && !(flags & memop_Store) && size == 32) + if (Num == 0) { - if (constLocalROR32 == 4) + MOV(X1, RCPU); + if (flags & memop_Store) { - LSL(rdMapped, rdMapped, 3); - RORV(rdMapped, W0, rdMapped); + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + } } - else if (constLocalROR32 > 0) - ROR_(rdMapped, W0, constLocalROR32 << 3); else - MOV(rdMapped, W0); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 16: QuickCallFunction(X3, SlowRead9); break; + case 8: QuickCallFunction(X3, SlowRead9); break; + } + } } - else if (flags & memop_SignExtend) + else { - if (size == 16) - SXTH(rdMapped, W0); - else if (size == 8) - SXTB(rdMapped, W0); + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite7); break; + } + } else - assert("What's wrong with you?"); + { + switch (size) + { + case 32: 
QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead7); break; + } + } } - else - MOV(rdMapped, W0); - - if (CurInstr.Info.Branches()) + + if (!(flags & memop_Store)) { - if (size < 32) - printf("LDR size < 32 branching?\n"); - Comp_JumpTo(rdMapped, Num == 0, false); + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); } } } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } } void Compiler::A_Comp_MemWB() @@ -589,19 +413,11 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - { - Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); - Comp_AddCycles_CDI(); - } - else - { - bool negative = addr < R15; - u32 abs = negative ? R15 - addr : addr - R15; - Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); - } + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -621,15 +437,138 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (regsCount == 0) return 0; // actually not the right behaviour TODO: fix me - SUB(SP, SP, ((regsCount + 1) & ~1) * 8); - if (store) + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 
4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; - int i = regsCount - 1; + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); BitSet16::Iterator it = regs.begin(); while (it != regs.end()) @@ -641,7 +580,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (usermode && reg >= 8 && reg < 15) { - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) MOV(W3, MapReg(reg)); else LoadReg(reg, W3); @@ -651,55 +590,67 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else if (!usermode && nextReg != regs.end()) { - ARM64Reg first = W3; - ARM64Reg second = W4; + ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); else LoadReg(reg, W3); - if (RegCache.Mapping[*nextReg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); - i--; + i++; it++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) + { STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } else { LoadReg(reg, W3); STR(INDEX_UNSIGNED, W3, SP, i * 8); } - i--; + i++; it++; } } - if (decrement) - { - SUB(W0, MapReg(rn), regsCount * 4); - preinc ^= true; - } - else - MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); MOVI2R(W2, regsCount); - BL(Num ? 
MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break; + } + } if (!store) { - Comp_AddCycles_CDI(); - if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + UBFX(W5, RCPSR, 0, 5); - int i = regsCount - 1; BitSet16::Iterator it = regs.begin(); while (it != regs.end()) { @@ -714,11 +665,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc MOVI2R(W1, reg - 8); BL(WriteBanked); FixupBranch alreadyWritten = CBNZ(W4); - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) MOV(MapReg(reg), W3); - RegCache.DirtyRegs |= 1 << reg; - } else SaveReg(reg, W3); SetJumpTarget(alreadyWritten); @@ -727,20 +675,12 @@ { ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; - } - if (RegCache.Mapping[*nextReg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); - if (*nextReg != 15) - RegCache.DirtyRegs |= 1 << *nextReg; - } - LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); if (first == W3) SaveReg(reg, W3); @@ -748,15 +688,12 @@ SaveReg(*nextReg, W4); it++; - i--; + i++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) { ARM64Reg mapped = MapReg(reg); LDR(INDEX_UNSIGNED, mapped, SP, i * 8); - - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; } else { @@ -765,11 +702,20 @@ it++; - i--; + i++; } } ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + if (!store && regs[15]) { ARM64Reg mapped = MapReg(15); diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +} \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4e45760..19684c4 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -3,8 +3,11 @@ #include "types.h" #include +#include +#include #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // here lands everything which doesn't fit into ARMJIT.h // where it would be included by pretty much
everything @@ -160,8 +163,8 @@ public: Data.SetLength(numAddresses * 2 + numLiterals); } - u32 PseudoPhysicalAddr; - + u32 StartAddr; + u32 StartAddrLocal; u32 InstrHash, LiteralHash; u8 Num; u16 NumAddresses; @@ -175,28 +178,8 @@ public: { return &Data[NumAddresses]; } u32* Literals() { return &Data[NumAddresses * 2]; } - u32* Links() - { return &Data[NumAddresses * 2 + NumLiterals]; } - - u32 NumLinks() - { return Data.Length - NumAddresses * 2 - NumLiterals; } - - void AddLink(u32 link) - { - Data.Add(link); - } - - void ResetLinks() - { - Data.SetLength(NumAddresses * 2 + NumLiterals); - } private: - /* - 0.. Data; }; @@ -207,45 +190,32 @@ struct __attribute__((packed)) AddressRange u32 Code; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemoryStatus9[0x800000]; -extern u8 MemoryStatus7[0x800000]; - extern TinyVector<u32> InvalidLiterals; -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); - -template <u32 Num> -void LinkBlock(ARM* cpu, u32 codeOffset); +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; -enum +inline bool PageContainsCode(AddressRange* range) { - memregion_Other = 0, - memregion_ITCM, - memregion_DTCM, - memregion_BIOS9, - memregion_MainRAM, - memregion_SWRAM9, - memregion_SWRAM7, - memregion_IO9, - memregion_VRAM, - memregion_BIOS7, - memregion_WRAM7, - memregion_IO7, - memregion_Wifi, - memregion_VWRAM, -}; + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); -int ClassifyAddress9(u32 addr); -int ClassifyAddress7(u32 addr); +template <u32 Num> +void LinkBlock(ARM* cpu, u32 codeOffset); -template <typename T> T SlowRead9(ARMv5* cpu, u32 addr); -template <typename T> void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template <typename T> T SlowRead9(u32 addr, ARMv5* cpu); +template <typename T> void SlowWrite9(u32 addr, ARMv5* cpu, T val); template <typename T> T SlowRead7(u32 addr); template <typename T> void SlowWrite7(u32 addr, T val); diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..162827d --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,822 @@ +#ifdef __SWITCH__ +#include "switch/compat_switch.h" +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "GPU.h" +#include "GPU3D.h" +#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and mapping the memory regions into it as they're structured + on the DS. + + On most systems you have a single piece of main ram, + maybe some video ram and faster cache RAM and that's about it. + Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is the 16 KB SWRAM block, which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform specific code.
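+
+  The rough flow (a simplified sketch of FaultHandler/MapAtAddress
+  below, not the literal code): both fastmem address spaces start out
+  fully unmapped, so the first JIT access to a page faults. The fault
+  handler then either maps in the whole mirror containing the emulated
+  address, or, if the region can't be mapped, rewrites the faulting
+  access into a call to the slow path, and finally restores the
+  register state and retries.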
+ +*/ + +namespace ARMJIT_Memory +{ +#ifdef __aarch64__ +struct FaultDescription +{ + u64 IntegerRegisters[33]; + u64 FaultAddr; + + u32 GetEmulatedAddr() + { + // now this is podracing + return (u32)IntegerRegisters[0]; + } + u64 RealAddr() + { + return FaultAddr; + } + + u64 GetPC() + { + return IntegerRegisters[32]; + } + + void RestoreAndRepeat(s64 offset); +}; +#else +struct FaultDescription +{ + u64 GetPC() + { + return 0; + } + + u32 GetEmulatedAddr() + { + return 0; + } + u64 RealAddr() + { + return 0; + } + + void RestoreAndRepeat(s64 offset); +}; +#endif + +void FaultHandler(FaultDescription* faultDesc); +} + + +#ifdef __aarch64__ + +extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +#endif + +#ifdef __SWITCH__ +// with LTO the symbols seem to be not properly overriden +// if they're somewhere else + +extern "C" +{ +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); + desc.IntegerRegisters[29] = ctx->fp.x; + desc.IntegerRegisters[30] = ctx->lr.x; + desc.IntegerRegisters[31] = ctx->sp.x; + desc.IntegerRegisters[32] = ctx->pc.x; + + ARMJIT_Memory::FaultHandler(&desc); + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} +#endif + +namespace ARMJIT_Memory +{ + +#ifdef __aarch64__ +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + IntegerRegisters[32] += offset; + + ARM_RestoreContext(IntegerRegisters); +} +#else +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + +} +#endif + +void* FastMem9Start, *FastMem7Start; + +const u32 MemoryTotalSize = + NDS::MainRAMSize + + NDS::SharedWRAMSize + + NDS::ARM7WRAMSize + + DTCMPhysicalSize; + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; +const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; +const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#ifdef __SWITCH__ +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#else +u8* MemoryBase; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + printf("%x\n", r); + return R_SUCCEEDED(r); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + printf("%x skip\n", NDS::ARM9->DTCMSize); + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } + } + } + } +}; +ARMJIT::TinyVector Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? 
memstate_MappedProtected : memstate_MappedRW; + + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then mapping the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCMEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCMEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCMEnd >= start && oldDTCMEnd < end)); + bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + { + Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + } + Mappings[memregion_SWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsMappable(region)) + return false; + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + + if (!isMapped) + return false; + + // this calculation even works with DTCM + // which doesn't have to be aligned to its own size + u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; + + u8* states = num == 0 ?
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < memorySize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < memorySize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeeded); + } + } + } + + Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + + return true; +} + +void FaultHandler(FaultDescription* faultDesc) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->GetEmulatedAddr(); + + if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + + s64 offset = 0; + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + } + faultDesc->RestoreAndRepeat(offset); + } +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeeded); + succeeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeeded); + + // 8 GB of address space, just don't ask...
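+ // each address space gets a full 4 GB reservation, so any 32 bit emulated
+ // address can be added to the base pointer as-is and the fast path needs
+ // neither masking nor a bounds check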
+ FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; +#else + MemoryBase = new u8[MemoryTotalSize]; + + NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; +#endif +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#else + delete[] MemoryBase; +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsMappable(int region) +{ + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mappingStart = 0; + mappingSize = NDS::ARM9->ITCMSize; + memorySize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_DTCM: + if (num == 0) + { + mappingStart = NDS::ARM9->DTCMBase; + mappingSize = NDS::ARM9->DTCMSize; + memorySize = DTCMPhysicalSize; + return true; + } + return false; + case memregion_BIOS9: + if (num == 0) + { + mappingStart = 0xFFFF0000; + mappingSize = 0x10000; + memorySize = 0x1000; + return true; + } + return false; + case memregion_MainRAM: + mappingStart = 0x2000000; + mappingSize = 0x1000000; + memorySize = NDS::MainRAMSize; + return true; + case memregion_SWRAM: + mappingStart = 0x3000000; + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mappingSize = 0x1000000; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM9.Mask + 1; + return true; + } + else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mappingSize = 0x800000; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + // this is a gross simplification + // mostly to make code on vram working + // it doesn't take any of the actual VRAM mappings into account + mappingStart = 0x6000000; + mappingSize = 0x1000000; + memorySize = 0x100000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mappingStart = 0; + mappingSize = 0x4000; + memorySize = 0x4000; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + if (NDS::SWRAM_ARM7.Mem) + { + mappingStart = 0x3800000; + mappingSize = 0x800000; + } + else + { + mappingStart = 0x3000000; + mappingSize = 0x1000000; + } + memorySize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mappingStart = 0x6000000; 
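+ // like the VRAM case on the ARM9 side this is a simplification: the ARM7
+ // allocated VRAM is treated as a single 128 KB block assumed to repeat
+ // across the whole 16 MB region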
+ mappingSize = 0x1000000; + memorySize = 0x20000; + return true; + } + return false; + default: + // for the JIT we don't care about the rest + return false; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM9.Mem) + return memregion_SWRAM; + else + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7.Mem) + return memregion_SWRAM; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensively + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; + } + } + else + { + switch
(addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } + } + } + return NULL; +} + +} \ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..1a59d98 --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,53 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); + +bool IsMappable(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd3fb70..34c1c91 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -301,24 +301,6 @@ Compiler::Compiler() RET(); } - { - CPSRDirty = true; - BranchStub[0] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<0>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - - CPSRDirty = true; - BranchStub[1] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<1>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - } - // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -520,6 +502,11 @@ void Compiler::Reset() FarCode = FarStart; } +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + void Compiler::Comp_SpecialBranchBehaviour(bool taken) { if (taken && CurInstr.BranchFlags & branch_IdleBranch) @@ -531,32 +518,11 
@@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) RegCache.PrepareExit(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) - && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)&ARM_Ret, true); - } + JMP((u8*)&ARM_Ret, true); } } -JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... { @@ -575,7 +541,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; // CPSR might have been modified in a previous block - CPSRDirty = Config::JIT_BrancheOptimisations == 2; + CPSRDirty = false; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); @@ -685,31 +651,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F RegCache.Flush(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 - && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) - && (!instrs[instrsCount - 1].Info.Branches() - || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken - || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)ARM_Ret, true); - } + JMP((u8*)ARM_Ret, true); /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -720,22 +662,6 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F return res; } -void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - JMP((u8*)entry, true); - SetCodePtr(curPtr); -} - -void Compiler::UnlinkBlock(u32 offset) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - NOP(5); - SetCodePtr(curPtr); -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f2fc301..09ac257 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -52,10 +52,7 @@ public: void Reset(); - void LinkBlock(u32 offset, JitBlockEntry entry); - void UnlinkBlock(u32 offset); - - JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -202,6 +199,10 @@ public: SetCodePtr(FarCode); } + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + u8* FarCode; u8* NearCode; u32 FarSize; @@ -216,8 +217,6 @@ public: bool Exit; bool IrregularCycles; - void* BranchStub[2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index cf0bd23..0bf2f83 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -15,6 +15,11 @@ int squeezePointer(T* ptr) return truncated; } +s32 Compiler::RewriteMemAccess(u64 pc) +{ + return 0; +} + /* According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) of all memory load and store instructions always access addresses in the same region as @@ -27,14 +32,15 @@ int squeezePointer(T* ptr) bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); + return false; + //u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - } + }*/ u32 val; // make sure arm7 bios is accessible @@ -95,7 +101,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - if (!addrIsStatic) + if (true) { OpArg rnMapped = MapReg(rn); if (Thumb && rn == 15) @@ -145,7 +151,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, rnMapped, R(finalAddr)); } - int expectedTarget = Num == 0 + /*int expectedTarget = Num == 0 ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); if (CurInstr.Cond() < 0xE) @@ -184,8 +190,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (addrIsStatic && compileSlowPath) MOV(32, R(RSCRATCH3), Imm32(staticAddress)); - - if (compileFastPath) +*/ + /*if (compileFastPath) { FixupBranch slowPath; if (compileSlowPath) @@ -357,15 +363,16 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz SetJumpTarget(slowPath); } } - - if (compileSlowPath) +*/ + if (true) { PushRegs(false); if (Num == 0) { - MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); if (flags & memop_Store) { MOV(32, R(ABI_PARAM3), rdMapped); @@ -423,13 +430,13 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); } } - +/* if (compileFastPath && compileSlowPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!(flags & memop_Store) && rd == 15) { @@ -458,7 +465,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 stackAlloc = ((regsCount + 1) & ~1) * 8; #endif u32 allocOffset = stackAlloc - regsCount * 8; - +/* int expectedTarget = Num == 0 ? ClassifyAddress9(CurInstr.DataRegion) : ClassifyAddress7(CurInstr.DataRegion); @@ -479,7 +486,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc default: break; } - +*/ if (!store) Comp_AddCycles_CDI(); else @@ -492,7 +499,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else MOV(32, R(RSCRATCH4), MapReg(rn)); - +/* if (compileFastPath) { assert(!usermode); @@ -570,7 +577,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SwitchToFarCode(); SetJumpTarget(slowPath); - } + }*/ if (!store) { @@ -696,13 +703,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PopRegs(false); } - +/* if (compileFastPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!store && regs[15]) { diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b50e821..ccec951 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -206,15 +206,14 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15, - - T_SetNZ = 1 << 16, - T_SetCV = 1 << 17, - T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19, - T_SetC = 1 << 20, + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, - T_WriteMem = 1 << 21, + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -256,31 +255,31 @@ const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); -const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); -const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); -const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 
T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); -const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); -const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); -const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); -const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); -const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); -const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); -const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); -const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); -const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); @@ -347,7 +346,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_BranchAlways) res.DstRegs |= (1 << 15); - if (data & T_PopPC && instr & (1 << 8)) + if (res.Kind == tk_POP && instr & (1 << 8)) res.DstRegs |= 1 << 15; if (data & T_SetNZ) @@ -364,11 +363,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; - if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + if (data & T_LoadMem) { - if (!Config::JIT_LiteralOptimisations) - res.SrcRegs |= 1 << 15; - res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) @@ -401,11 +407,17 @@ Info Decode(bool thumb, u32 num, u32 instr) else if ((instr >> 28) == 0xF) data = ak(ak_Nop); - if (data & A_UnkOnARM7 && num != 0) + if (data & A_UnkOnARM7 && num == 1) data = A_UNK; res.Kind = (data >> 22) & 0x1FF; + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + if (res.Kind == ak_MCR) { u32 cn = (instr >> 16) & 0xF; @@ -490,8 +502,13 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_WriteMem) res.SpecialKind = special_WriteMem; - if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) - res.SpecialKind = special_LoadLiteral; + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } if (res.Kind == ak_LDM) { diff --git 
a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 6ab4929..a702435 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -232,6 +232,7 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, + special_LoadMem, special_WaitForInterrupt, special_LoadLiteral }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f35b3e9..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,9 +55,11 @@ if (ENABLE_JIT) enable_language(ASM) target_sources(core PRIVATE - ARMJIT.cpp ARM_InstrInfo.cpp + ARMJIT.cpp + ARMJIT_Memory.cpp + dolphin/CommonFuncs.cpp ) @@ -85,6 +87,8 @@ if (ENABLE_JIT) ARMJIT_A64/ARMJIT_ALU.cpp ARMJIT_A64/ARMJIT_LoadStore.cpp ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s ) endif() endif() diff --git a/src/CP15.cpp b/src/CP15.cpp index 225847e..3d64259 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -22,6 +22,7 @@ #include "DSi.h" #include "ARM.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -42,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -75,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -98,36 +99,30 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { -#ifdef JIT_ENABLED - u32 oldDTCMBase = DTCMBase; - u32 oldDTCMSize = DTCMSize; -#endif + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) { - ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); - ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; } -#endif } void ARMv5::UpdateITCMSetting() { -#ifdef JIT_ENABLED - u32 oldITCMSize = ITCMSize; -#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -138,10 +133,6 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldITCMSize != ITCMSize) - ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); -#endif } @@ -581,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. 
%08X\n", val); + //Halt(255); return; @@ -723,7 +717,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -750,13 +744,13 @@ void ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -773,13 +767,13 @@ void ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -796,13 +790,13 @@ void ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -817,13 +811,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -838,16 +832,16 @@ void ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -864,16 +858,16 @@ void ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -890,16 +884,16 @@ void ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -914,16 +908,16 @@ void 
ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 22e9c11..edf84f2 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -47,8 +47,9 @@ int JIT_LiteralOptimisations = true; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; +int JIT_BrancheOptimisations = true; int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; #endif ConfigEntry ConfigFile[] = @@ -72,8 +73,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 31fa67a..7b19a4b 100644 --- a/src/Config.h +++ b/src/Config.h @@ -63,6 +63,7 @@ extern int JIT_Enable; extern int JIT_MaxBlockSize; extern int JIT_BrancheOptimisations; extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 657241f..3d65482 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,7 @@ #include "AREngine.h" #include "Platform.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -94,17 +95,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -171,6 +172,10 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); +#else + MainRAM = new u8[MainRAMSize]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; #endif DMAs[0] = new DMA(0, 0); @@ -485,6 +490,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -510,7 +519,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -587,10 +596,6 @@ void Reset() } AREngine::Reset(); - -#ifdef JIT_ENABLED - ARMJIT::Reset(); -#endif } void Stop() @@ -705,7 +710,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -1128,43 +1133,40 @@ void MapSharedWRAM(u8 val) if (val == WRAMCnt) return; + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch (WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - 
SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } - -#ifdef JIT_ENABLED - ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); - ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); -#endif } @@ -1835,12 +1837,12 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & MainRAMMask]; + return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1900,12 +1902,12 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & MainRAMMask]; + return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1968,9 +1970,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -2026,7 +2028,7 @@ void ARM9Write8(u32 addr, u8 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2035,12 +2037,12 @@ void ARM9Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2085,7 +2087,7 @@ void ARM9Write16(u32 addr, u16 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2094,12 +2096,12 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2113,18 +2115,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2165,7 +2165,7 @@ void ARM9Write32(u32 addr, u32 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2174,12 +2174,12 @@ void ARM9Write32(u32 addr, u32 val) return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2193,18 +2193,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2250,10 +2248,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2292,17 +2290,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2352,17 +2350,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2419,17 +2417,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return 
*(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2474,7 +2472,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2483,28 +2481,28 @@ void ARM7Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2514,7 +2512,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2551,7 +2549,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2560,28 +2558,28 @@ void ARM7Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2599,7 +2597,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2638,7 +2636,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2647,28 +2645,28 @@ void ARM7Write32(u32 addr, u32 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef 
JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2687,7 +2685,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2736,17 +2734,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } diff --git a/src/NDS.h b/src/NDS.h index e9b56da..4b4f9a1 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,20 +162,20 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; -extern u8 SharedWRAM[0x8000]; -extern u8* SWRAM_ARM9; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM9Mask; -extern u32 SWRAM_ARM7Mask; - -extern u8 ARM7WRAM[0x10000]; +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); -- cgit v1.2.3 From c5381d2911d47fb1fcbd6ec27a83f5da3606c4bd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 30 Jun 2020 23:50:41 +0200 Subject: reconcile DSi and JIT, fastmem for x64 and Windows --- src/ARM.cpp | 23 +- src/ARM.h | 2 +- src/ARMJIT.cpp | 273 +-- src/ARMJIT.h | 2 + src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 4 +- src/ARMJIT_Internal.h | 12 +- src/ARMJIT_Memory.cpp | 636 ++++-- src/ARMJIT_Memory.h | 16 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 109 + src/ARMJIT_x64/ARMJIT_Compiler.h | 14 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 632 +++--- src/CP15.cpp | 21 + src/Config.cpp | 20 +- src/Config.h | 9 +- src/DSi.cpp | 167 +- src/DSi.h | 15 + src/DSi_I2C.cpp | 4 +- src/NDS.cpp | 41 +- src/NDS.h | 2 + src/frontend/qt_sdl/EmuSettingsDialog.cpp | 115 +- src/frontend/qt_sdl/EmuSettingsDialog.h | 5 +- src/frontend/qt_sdl/EmuSettingsDialog.ui | 598 +++--- src/frontend/qt_sdl/main.cpp | 9 +- src/frontend/qt_sdl/main.h | 1 + src/libui_sdl/DlgEmuSettings.cpp | 252 --- src/libui_sdl/libui/ui.h | 764 ------- src/libui_sdl/libui/unix/stddialogs.c | 126 -- src/libui_sdl/libui/windows/stddialogs.cpp | 180 -- 
src/libui_sdl/main.cpp | 3061 ---------------------------- 29 files changed, 1656 insertions(+), 5457 deletions(-) delete mode 100644 src/libui_sdl/DlgEmuSettings.cpp delete mode 100644 src/libui_sdl/libui/ui.h delete mode 100644 src/libui_sdl/libui/unix/stddialogs.c delete mode 100644 src/libui_sdl/libui/windows/stddialogs.cpp delete mode 100644 src/libui_sdl/main.cpp (limited to 'src/Config.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index e529be8..8530795 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,12 +21,15 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" -#include "ARMJIT.h" #include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif // instruction timing notes // @@ -109,6 +112,12 @@ void ARM::Reset() CodeMem.Mem = NULL; +#ifdef JIT_ENABLED + FastBlockLookup = NULL; + FastBlockLookupStart = 0; + FastBlockLookupSize = 0; +#endif + // zorp JumpTo(ExceptionBase); } @@ -752,6 +761,12 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #ifdef JIT_ENABLED @@ -820,6 +835,12 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #endif diff --git a/src/ARM.h b/src/ARM.h index b7f16d6..0248e26 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -147,7 +147,7 @@ public: NDS::MemRegion CodeMem; #ifdef JIT_ENABLED - u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; #endif diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 53b28c1..2a61c38 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -18,6 +18,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "SPU.h" @@ -38,25 +39,35 @@ namespace ARMJIT Compiler* JITCompiler; AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; -AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMMaxSize / 512]; AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; AddressRange CodeIndexVRAM[0x100000 / 512]; AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; +AddressRange CodeIndexBIOS9DSi[0x10000 / 512]; +AddressRange CodeIndexBIOS7DSi[0x10000 / 512]; +AddressRange CodeIndexNWRAM_A[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_B[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_C[DSi::NWRAMSize / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; -u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMMaxSize / 2]; u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; u64 FastBlockLookupVRAM[0x100000 / 2]; u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; +u64 FastBlockLookupBIOS9DSi[0x10000 / 2]; +u64 FastBlockLookupBIOS7DSi[0x10000 / 2]; +u64 FastBlockLookupNWRAM_A[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_B[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_C[DSi::NWRAMSize / 2]; const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { @@ -64,7 +75,7 @@ const u32 
CodeRegionSizes[ARMJIT_Memory::memregions_Count] = ITCMPhysicalSize, 0, sizeof(NDS::ARM9BIOS), - NDS::MainRAMSize, + NDS::MainRAMMaxSize, NDS::SharedWRAMSize, 0, 0x100000, @@ -73,6 +84,11 @@ const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = 0, 0, 0x40000, + 0x10000, + 0x10000, + sizeof(DSi::NWRAM_A), + sizeof(DSi::NWRAM_B), + sizeof(DSi::NWRAM_C), }; AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = @@ -90,6 +106,11 @@ AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = NULL, NULL, CodeIndexARM7WVRAM, + CodeIndexBIOS9DSi, + CodeIndexBIOS7DSi, + CodeIndexNWRAM_A, + CodeIndexNWRAM_B, + CodeIndexNWRAM_C }; u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = @@ -106,7 +127,12 @@ u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = FastBlockLookupARM7WRAM, NULL, NULL, - FastBlockLookupARM7WVRAM + FastBlockLookupARM7WVRAM, + FastBlockLookupBIOS9DSi, + FastBlockLookupBIOS7DSi, + FastBlockLookupNWRAM_A, + FastBlockLookupNWRAM_B, + FastBlockLookupNWRAM_C }; u32 LocaliseCodeAddress(u32 num, u32 addr) @@ -115,21 +141,14 @@ u32 LocaliseCodeAddress(u32 num, u32 addr) ? ARMJIT_Memory::ClassifyAddress9(addr) : ARMJIT_Memory::ClassifyAddress7(addr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize) - && CodeMemRegions[region]) - { - addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - addr |= (u32)region << 28; - return addr; - } + if (CodeMemRegions[region]) + return ARMJIT_Memory::LocaliseAddress(region, num, addr); return 0; } TinyVector InvalidLiterals; -template +template T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; @@ -141,11 +160,11 @@ T SlowRead9(u32 addr, ARMv5* cpu) else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; else if (std::is_same::value) - val = NDS::ARM9Read32(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read32 : DSi::ARM9Read32)(addr); else if (std::is_same::value) - val = NDS::ARM9Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read16 : DSi::ARM9Read16)(addr); else - val = NDS::ARM9Read8(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read8 : DSi::ARM9Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -153,7 +172,7 @@ T SlowRead9(u32 addr, ARMv5* cpu) return val; } -template +template void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); @@ -169,27 +188,19 @@ void SlowWrite9(u32 addr, ARMv5* cpu, T val) } else if (std::is_same::value) { - NDS::ARM9Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write32 : DSi::ARM9Write32)(addr, val); } else if (std::is_same::value) { - NDS::ARM9Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write16 : DSi::ARM9Write16)(addr, val); } else { - NDS::ARM9Write8(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write8 : DSi::ARM9Write8)(addr, val); } } -template void SlowWrite9(u32, ARMv5*, u32); -template void SlowWrite9(u32, ARMv5*, u16); -template void SlowWrite9(u32, ARMv5*, u8); - -template u32 SlowRead9(u32, ARMv5*); -template u16 SlowRead9(u32, ARMv5*); -template u8 SlowRead9(u32, ARMv5*); - -template +template T SlowRead7(u32 addr) { u32 offset = addr & 0x3; @@ -197,11 +208,11 @@ T SlowRead7(u32 addr) T val; if (std::is_same::value) - val = NDS::ARM7Read32(addr); + val = (ConsoleType == 0 ? 
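(The SlowRead/SlowWrite fallbacks being rewritten here are templated on ConsoleType so the NDS-versus-DSi bus choice is resolved at compile time; the INSTANTIATE_SLOWMEM macro further down stamps out both variants. A trimmed sketch of the same trick, with placeholder bus functions standing in for NDS::ARM7Read32 / DSi::ARM7Read32:)

    #include <cstdint>
    using u32 = std::uint32_t;

    u32 NDSRead32(u32 addr) { return addr; }   // placeholder bodies for the sketch
    u32 DSiRead32(u32 addr) { return ~addr; }

    // ConsoleType is a compile-time constant, so each instantiation collapses
    // to a direct call, and the JIT can take the address of whichever
    // specialisation it needs without any runtime branch.
    template <u32 ConsoleType>
    u32 SlowRead32(u32 addr)
    {
        return (ConsoleType == 0 ? NDSRead32 : DSiRead32)(addr);
    }

    // Mirrors the effect of INSTANTIATE_SLOWMEM(0) / INSTANTIATE_SLOWMEM(1).
    template u32 SlowRead32<0>(u32);
    template u32 SlowRead32<1>(u32);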
NDS::ARM7Read32 : DSi::ARM7Read32)(addr); else if (std::is_same::value) - val = NDS::ARM7Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read16 : DSi::ARM7Read16)(addr); else - val = NDS::ARM7Read8(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read8 : DSi::ARM7Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -209,67 +220,71 @@ T SlowRead7(u32 addr) return val; } -template +template void SlowWrite7(u32 addr, T val) { addr &= ~(sizeof(T) - 1); if (std::is_same::value) - NDS::ARM7Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write32 : DSi::ARM7Write32)(addr, val); else if (std::is_same::value) - NDS::ARM7Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write16 : DSi::ARM7Write16)(addr, val); else - NDS::ARM7Write8(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write8 : DSi::ARM7Write8)(addr, val); } -template +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite9(addr, cpu, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(addr, cpu); + data[i] = SlowRead9(addr, cpu); addr += 4; } } -template +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite7(addr, data[i]); + SlowWrite7(addr, data[i]); else - data[i] = SlowRead7(addr); + data[i] = SlowRead7(addr); addr += 4; } } -template void SlowWrite7(u32, u32); -template void SlowWrite7(u32, u16); -template void SlowWrite7(u32, u8); - -template u32 SlowRead7(u32); -template u16 SlowRead7(u32); -template u8 SlowRead7(u32); - -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +#define INSTANTIATE_SLOWMEM(consoleType) \ + template void SlowWrite9(u32, ARMv5*, u32); \ + template void SlowWrite9(u32, ARMv5*, u16); \ + template void SlowWrite9(u32, ARMv5*, u8); \ + \ + template u32 SlowRead9(u32, ARMv5*); \ + template u16 SlowRead9(u32, ARMv5*); \ + template u8 SlowRead9(u32, ARMv5*); \ + \ + template void SlowWrite7(u32, u32); \ + template void SlowWrite7(u32, u16); \ + template void SlowWrite7(u32, u8); \ + \ + template u32 SlowRead7(u32); \ + template u16 SlowRead7(u32); \ + template u8 SlowRead7(u32); \ + \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + +INSTANTIATE_SLOWMEM(0) +INSTANTIATE_SLOWMEM(1) template struct UnreliableHashTable @@ -616,6 +631,12 @@ void CompileBlock(ARM* cpu) u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + auto& map = cpu->Num == 0 ? 
JitBlocks9 : JitBlocks7; auto existingBlockIt = map.find(blockAddr); if (existingBlockIt != map.end()) @@ -623,18 +644,24 @@ void CompileBlock(ARM* cpu) // there's already a block, though it's not inside the fast map // could be that there are two blocks at the same physical addr // but different mirrors - u32 localAddr = existingBlockIt->second->StartAddrLocal; + u32 otherLocalAddr = existingBlockIt->second->StartAddrLocal; - u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; - *entry = ((u64)blockAddr | cpu->Num) << 32; - *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); - return; - } + if (localAddr == otherLocalAddr) + { + JIT_DEBUGPRINT("switching out block %x %x %x\n", localAddr, blockAddr, existingBlockIt->second->StartAddr); - u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); - if (!localAddr) - { - printf("trying to compile non executable code? %x\n", blockAddr); + u64* entry = &FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + // some memory has been remapped + JitBlock* prevBlock = RestoreCandidates.Insert(existingBlockIt->second->InstrHash, existingBlockIt->second); + if (prevBlock) + delete prevBlock; + + map.erase(existingBlockIt); } FetchedInstr instrs[Config::JIT_MaxBlockSize]; @@ -655,7 +682,7 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -678,7 +705,7 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); - assert(translatedAddr); + assert(translatedAddr >> 27); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -727,7 +754,10 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; - if (instrs[i].Info.DstRegs & (1 << 14)) + if (instrs[i].Info.DstRegs & (1 << 14) + || (!thumb + && (instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_IMM || instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_REG) + && instrs[i].Instr & (1 << 16))) hasLink = false; if (thumb) @@ -792,7 +822,7 @@ void CompileBlock(ARM* cpu) i--; } - if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + if (instrs[i].Info.Branches() && Config::JIT_BranchOptimisations) { bool hasBranched = cpu->R[15] != r15; @@ -830,8 +860,6 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); - if (link) { lr = linkAddr; @@ -927,6 +955,8 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + + JIT_DEBUGPRINT("block start %p\n", block->EntryPoint); } else { @@ -940,12 +970,12 @@ void CompileBlock(ARM* cpu) assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; - if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) - 
ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); - AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; range->Code |= addressMasks[j]; range->Blocks.Add(block); } @@ -955,7 +985,7 @@ void CompileBlock(ARM* cpu) else JitBlocks7[blockAddr] = block; - u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + u64* entry = &FastBlockLookupRegions[(localAddr >> 27)][(localAddr & 0x7FFFFFF) / 2]; *entry = ((u64)blockAddr | cpu->Num) << 32; *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } @@ -964,8 +994,8 @@ void InvalidateByAddr(u32 localAddr) { JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* region = CodeMemRegions[localAddr >> 28]; - AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + AddressRange* region = CodeMemRegions[localAddr >> 27]; + AddressRange* range = ®ion[(localAddr & 0x7FFFFFF) / 512]; u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; @@ -994,9 +1024,9 @@ void InvalidateByAddr(u32 localAddr) range->Blocks.Remove(i); if (range->Blocks.Length == 0 - && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) { - ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + ARMJIT_Memory::SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); } bool literalInvalidation = false; @@ -1019,8 +1049,8 @@ void InvalidateByAddr(u32 localAddr) u32 addr = block->AddressRanges()[j]; if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRegion = CodeMemRegions[addr >> 28]; - AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 27]; + AddressRange* otherRange = &otherRegion[(addr & 0x7FFFFFF) / 512]; assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); @@ -1028,15 +1058,15 @@ void InvalidateByAddr(u32 localAddr) if (otherRange->Blocks.Length == 0) { - if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) - ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); otherRange->Code = 0; } } } - FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; + FastBlockLookupRegions[block->StartAddrLocal >> 27][(block->StartAddrLocal & 0x7FFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) JitBlocks9.erase(block->StartAddr); else @@ -1055,19 +1085,25 @@ void InvalidateByAddr(u32 localAddr) } } -template -void CheckAndInvalidate(u32 addr) +void CheckAndInvalidateITCM() { - // let's hope this gets all properly inlined - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + for (u32 i = 0; i < ITCMPhysicalSize; i+=16) { - u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) - InvalidateByAddr(localAddr | (region << 28)); + if (CodeIndexITCM[i / 512].Code & (1 << ((i & 0x1FF) / 16))) + { + 
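(Worth spelling out, since this commit changes every shift and mask in sight: a "localised" code address now keeps the memory-region index in the top 5 bits and the offset into that region's backing memory in the low 27 bits, where it was previously a 4/28 split. Each AddressRange covers 512 bytes of guest memory, and its Code field is a 32-bit bitmap at 16-byte granularity. A small sketch of the arithmetic, assuming only that layout:)

    #include <cstdint>
    using u32 = std::uint32_t;

    // Localised address layout after this commit: region in bits 27..31,
    // offset within the region in bits 0..26 (hence the >> 27 / & 0x7FFFFFF
    // pairs replacing the old >> 28 / & 0xFFFFFFF).
    constexpr u32 LocaliseAddr(u32 region, u32 offset) { return (region << 27) | (offset & 0x7FFFFFF); }
    constexpr u32 RegionOf(u32 localAddr)  { return localAddr >> 27; }
    constexpr u32 OffsetOf(u32 localAddr)  { return localAddr & 0x7FFFFFF; }

    // One AddressRange per 512 bytes; one Code bit per 16-byte chunk, which
    // is what the (1 << ((addr & 0x1FF) / 16)) expressions index.
    constexpr u32 RangeIndex(u32 localAddr) { return OffsetOf(localAddr) / 512; }
    constexpr u32 ChunkBit(u32 localAddr)   { return 1u << ((localAddr & 0x1FF) / 16); }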
InvalidateByAddr(i | (ARMJIT_Memory::memregion_ITCM << 27)); + } } } +template +void CheckAndInvalidate(u32 addr) +{ + u32 localAddr = ARMJIT_Memory::LocaliseAddress(region, num, addr); + if (CodeMemRegions[region][(localAddr & 0x7FFFFFF) / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr); +} + JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) { u64* entry = &entries[offset / 2]; @@ -1076,35 +1112,44 @@ JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) return NULL; } +void blockSanityCheck(u32 num, u32 blockAddr, JitBlockEntry entry) +{ + u32 localAddr = LocaliseCodeAddress(num, blockAddr); + assert(JITCompiler->AddEntryOffset((u32)FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]) == entry); +} + bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { + // amazingly ignoring the DTCM is the proper behaviour for code fetches int region = num == 0 ? ARMJIT_Memory::ClassifyAddress9(blockAddr) : ARMJIT_Memory::ClassifyAddress7(blockAddr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (CodeMemRegions[region] - && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize)) + u32 memoryOffset; + if (FastBlockLookupRegions[region] + && ARMJIT_Memory::GetMirrorLocation(region, num, blockAddr, memoryOffset, start, size)) { + //printf("setup exec region %d %d %08x %08x %x %x\n", num, region, blockAddr, start, size, memoryOffset); entry = FastBlockLookupRegions[region] + memoryOffset / 2; - // evil, though it should work for everything except DTCM which is not relevant here - start = blockAddr & ~(memorySize - 1); - size = memorySize; return true; } - else - return false; + return false; } template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); -template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); -template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); void ResetBlockCache() { @@ -1133,7 +1178,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } @@ -1145,7 +1190,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - 
AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2320b7b..04add59 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -16,6 +16,8 @@ void DeInit(); void Reset(); +void CheckAndInvalidateITCM(); + void InvalidateByAddr(u32 pseudoPhysical); template diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index b307d0e..c1b23a7 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -168,7 +168,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) { ptrdiff_t memopStart = GetCodeOffset(); LoadStorePatch patch; @@ -461,7 +461,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); bool compileFastPath = Config::JIT_FastMemory - && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); if (decrement) { diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 19684c4..c87e1b3 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -214,13 +214,13 @@ u32 LocaliseCodeAddress(u32 num, u32 addr); template void LinkBlock(ARM* cpu, u32 codeOffset); -template T SlowRead9(u32 addr, ARMv5* cpu); -template void SlowWrite9(u32 addr, ARMv5* cpu, T val); -template T SlowRead7(u32 addr); -template void SlowWrite7(u32 addr, T val); +template T SlowRead9(u32 addr, ARMv5* cpu); +template void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); -template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); } diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp index 162827d..0276c65 100644 --- a/src/ARMJIT_Memory.cpp +++ b/src/ARMJIT_Memory.cpp @@ -1,5 +1,7 @@ -#ifdef __SWITCH__ +#if defined(__SWITCH__) #include "switch/compat_switch.h" +#elif defined(_WIN32) +#include #endif #include "ARMJIT_Memory.h" @@ -7,6 +9,7 @@ #include "ARMJIT_Internal.h" #include "ARMJIT_Compiler.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "Wifi.h" @@ -37,66 +40,24 @@ namespace ARMJIT_Memory { -#ifdef __aarch64__ -struct FaultDescription -{ - u64 IntegerRegisters[33]; - u64 FaultAddr; - - u32 GetEmulatedAddr() - { - // now this is podracing - return (u32)IntegerRegisters[0]; - } - u64 RealAddr() - { - return FaultAddr; - } - - u64 GetPC() - { - return IntegerRegisters[32]; - } - - void RestoreAndRepeat(s64 offset); -}; -#else struct FaultDescription { - u64 GetPC() - { - return 0; - } - - u32 GetEmulatedAddr() - { - return 0; - } - u64 RealAddr() - { - return 0; - } - - void RestoreAndRepeat(s64 
offset); + u32 EmulatedFaultAddr; + u64 FaultPC; }; -#endif -void FaultHandler(FaultDescription* faultDesc); +bool FaultHandler(FaultDescription* faultDesc, s32& offset); } - -#ifdef __aarch64__ - -extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); - -#endif - -#ifdef __SWITCH__ +#if defined(__SWITCH__) // with LTO the symbols seem to be not properly overriden // if they're somewhere else extern "C" { + +void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + extern char __start__; extern char __rodata_start; @@ -106,57 +67,85 @@ u64 __nx_exception_stack_size = 0x8000; void __libnx_exception_handler(ThreadExceptionDump* ctx) { ARMJIT_Memory::FaultDescription desc; - memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); - desc.IntegerRegisters[29] = ctx->fp.x; - desc.IntegerRegisters[30] = ctx->lr.x; - desc.IntegerRegisters[31] = ctx->sp.x; - desc.IntegerRegisters[32] = ctx->pc.x; + desc.EmulatedFaultAddr = ctx->cpu_gprs[0].w; + desc.FaultPC = ctx->pc.x; + + u64 integerRegisters[33]; + memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); + integerRegisters[29] = ctx->fp.x; + integerRegisters[30] = ctx->lr.x; + integerRegisters[31] = ctx->sp.x; + integerRegisters[32] = ctx->pc.x; + + s32 offset; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + integerRegisters[32] += offset; - ARMJIT_Memory::FaultHandler(&desc); + ARM_RestoreContext(integerRegisters); + } if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) { - printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); } else { - printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); } } } + +#elif defined(_WIN32) + +static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) +{ + if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) + return EXCEPTION_CONTINUE_SEARCH; + + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = exceptionInfo->ContextRecord->Rcx; + desc.FaultPC = exceptionInfo->ContextRecord->Rip; + + s32 offset = 0; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + exceptionInfo->ContextRecord->Rip += offset; + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + #endif namespace ARMJIT_Memory { -#ifdef __aarch64__ -void FaultDescription::RestoreAndRepeat(s64 offset) -{ - IntegerRegisters[32] += offset; +void* FastMem9Start, *FastMem7Start; - ARM_RestoreContext(IntegerRegisters); +#ifdef _WIN32 +inline u32 RoundUp(u32 size) +{ + return (size + 0xFFFF) & ~0xFFFF; } #else -void FaultDescription::RestoreAndRepeat(s64 offset) +inline u32 RoundUp(u32 size) { - + return size; } #endif -void* FastMem9Start, *FastMem7Start; - -const u32 MemoryTotalSize = - NDS::MainRAMSize - + NDS::SharedWRAMSize - + NDS::ARM7WRAMSize - + DTCMPhysicalSize; - const u32 MemBlockMainRAMOffset = 0; -const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; -const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; -const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; +const u32 MemBlockSWRAMOffset = RoundUp(NDS::MainRAMMaxSize); +const u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(NDS::SharedWRAMSize); +const u32 MemBlockDTCMOffset 
= MemBlockARM7WRAMOffset + RoundUp(NDS::ARM7WRAMSize); +const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(DSi::NWRAMSize); +const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(DSi::NWRAMSize); +const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(DSi::NWRAMSize); const u32 OffsetsPerRegion[memregions_Count] = { @@ -173,6 +162,11 @@ const u32 OffsetsPerRegion[memregions_Count] = UINT32_MAX, UINT32_MAX, UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockNWRAM_AOffset, + MemBlockNWRAM_BOffset, + MemBlockNWRAM_COffset }; enum @@ -186,11 +180,13 @@ enum u8 MappingStatus9[1 << (32-12)]; u8 MappingStatus7[1 << (32-12)]; -#ifdef __SWITCH__ +#if defined(__SWITCH__) u8* MemoryBase; u8* MemoryBaseCodeMem; -#else +#elif defined(_WIN32) u8* MemoryBase; +HANDLE MemoryFile; +LPVOID ExceptionHandlerHandle; #endif bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) @@ -200,6 +196,9 @@ bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size)); return R_SUCCEEDED(r); +#elif defined(_WIN32) + bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; + return r; #endif } @@ -209,8 +208,24 @@ bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) #ifdef __SWITCH__ Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size); - printf("%x\n", r); return R_SUCCEEDED(r); +#else + return UnmapViewOfFile(dst); +#endif +} + +void SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#if defined(_WIN32) + DWORD winProtection, oldProtection; + if (protection == 0) + winProtection = PAGE_NOACCESS; + else if (protection == 1) + winProtection = PAGE_READONLY; + else + winProtection = PAGE_READWRITE; + VirtualProtect(dst, size, winProtection, &oldProtection); #endif } @@ -230,7 +245,6 @@ struct Mapping if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) { offset += NDS::ARM9->DTCMSize; - printf("%x skip\n", NDS::ARM9->DTCMSize); } else { @@ -245,6 +259,7 @@ struct Mapping offset += 0x1000; } +#ifdef __SWITCH__ if (status == memstate_MappedRW) { u32 segmentSize = offset - segmentOffset; @@ -252,8 +267,12 @@ struct Mapping bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); assert(success); } +#endif } } +#if defined(_WIN32) + UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); +#endif } }; ARMJIT::TinyVector Mappings[memregions_Count]; @@ -268,6 +287,8 @@ void SetCodeProtection(int region, u32 offset, bool protect) Mapping& mapping = Mappings[region][i]; u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (offset < mapping.LocalOffset || offset >= mapping.LocalOffset + mapping.Size) + continue; if (mapping.Num == 0 && region != memregion_DTCM && effectiveAddr >= NDS::ARM9->DTCMBase @@ -276,16 +297,20 @@ void SetCodeProtection(int region, u32 offset, bool protect) u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); - printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); assert(states[effectiveAddr >> 12] == (protect ? 
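(The Windows path added here leans on a single pagefile-backed file mapping as the backing store for all emulated RAM: MapViewOfFileEx places extra views of the same section inside the reserved fastmem regions, which is what makes mirrors cheap, and VirtualProtect toggles page access for code protection. Note the RoundUp to 64 KB above — view offsets must be multiples of the Windows allocation granularity. A standalone sketch of the mirroring idea:)

    #ifdef _WIN32
    #include <windows.h>
    #include <cassert>

    int main()
    {
        const DWORD size = 0x10000; // one 64 KB granule
        HANDLE section = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, size, NULL);
        assert(section);

        // Two views of the same section: the same physical pages show up at
        // two different virtual addresses.
        char* viewA = (char*)MapViewOfFile(section, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, size);
        char* viewB = (char*)MapViewOfFile(section, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, size);
        assert(viewA && viewB);

        viewA[0] = 42;
        assert(viewB[0] == 42); // write through one view, read through the other

        UnmapViewOfFile(viewB);
        UnmapViewOfFile(viewA);
        CloseHandle(section);
        return 0;
    }
    #endif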
memstate_MappedRW : memstate_MappedProtected)); states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; +#if defined(__SWITCH__) bool success; if (protect) success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); else success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); assert(success); +#elif defined(_WIN32) + SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 1 : 2); +#endif } } @@ -314,8 +339,8 @@ void RemapDTCM(u32 newBase, u32 newSize) printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); - bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); - bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd < start); + bool newOverlap = newSize > 0 && !(newBase >= end || newEnd < start); if (mapping.Num == 0 && (oldOverlap || newOverlap)) { @@ -336,19 +361,50 @@ void RemapDTCM(u32 newBase, u32 newSize) Mappings[memregion_DTCM].Clear(); } +void RemapNWRAM(int num) +{ + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) + { + Mapping& mapping = Mappings[memregion_SharedWRAM][i]; + if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size + || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + { + mapping.Unmap(memregion_SharedWRAM); + Mappings[memregion_SharedWRAM].Remove(i); + } + else + { + i++; + } + } + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + num].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + num][i].Unmap(memregion_NewSharedWRAM_A + num); + } + Mappings[memregion_NewSharedWRAM_A + num].Clear(); +} + void RemapSWRAM() { printf("remapping SWRAM\n"); - for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length; i++) { - Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + Mappings[memregion_SharedWRAM][i].Unmap(memregion_SharedWRAM); } - Mappings[memregion_SWRAM].Clear(); + Mappings[memregion_SharedWRAM].Clear(); for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) { Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); } Mappings[memregion_WRAM7].Clear(); + for (int j = 0; j < 3; j++) + { + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + j].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + j][i].Unmap(memregion_NewSharedWRAM_A + j); + } + Mappings[memregion_NewSharedWRAM_A + j].Clear(); + } } bool MapAtAddress(u32 addr) @@ -359,33 +415,36 @@ bool MapAtAddress(u32 addr) ? ClassifyAddress9(addr) : ClassifyAddress7(addr); - if (!IsMappable(region)) + if (!IsFastmemCompatible(region)) return false; - u32 mappingStart, mappingSize, memoryOffset, memorySize; - bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + return false; + u32 mirrorStart, mirrorSize, memoryOffset; + bool isMapped = GetMirrorLocation(region, num, addr, memoryOffset, mirrorStart, mirrorSize); if (!isMapped) return false; - // this calculation even works with DTCM - // which doesn't have to be aligned to it's own size - u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; - u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; - printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + printf("trying to create mapping %x, %x %d %d\n", mirrorStart, mirrorSize, region, num); bool isExecutable = ARMJIT::CodeMemRegions[region]; - ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; +#if defined(_WIN32) + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); +#endif + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; // this overcomplicated piece of code basically just finds whole pieces of code memory // which can be mapped u32 offset = 0; bool skipDTCM = num == 0 && region != memregion_DTCM; - while (offset < memorySize) + while (offset < mirrorSize) { if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) { + SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); offset += NDS::ARM9->DTCMSize; } else @@ -393,7 +452,7 @@ bool MapAtAddress(u32 addr) u32 sectionOffset = offset; bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) - && offset < memorySize + && offset < mirrorSize && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) { assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); @@ -403,41 +462,49 @@ bool MapAtAddress(u32 addr) u32 sectionSize = offset - sectionOffset; +#if defined(__SWITCH__) if (!hasCode) { printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); assert(succeded); } +#elif defined(_WIN32) + if (hasCode) + { + SetCodeProtectionRange(mirrorStart + offset, sectionSize, num, 1); + } +#endif } } - Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + assert(num == 0 || num == 1); + Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; Mappings[region].Add(mapping); - printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); return true; } -void FaultHandler(FaultDescription* faultDesc) +bool FaultHandler(FaultDescription* faultDesc, s32& offset) { - if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) { bool rewriteToSlowPath = true; - u32 addr = faultDesc->GetEmulatedAddr(); + u32 addr = faultDesc->EmulatedFaultAddr; if ((NDS::CurCPU == 0 ? 
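(The protocol between the generated fast path and the host fault handler, restated as a sketch — IsJITFault, TryMapAtAddress and RewriteMemAccess are stand-ins for the real hooks: a fault inside JIT code is either satisfied by materialising the missing mapping and retrying the access, or the faulting PC is advanced by an offset that lands in a patched-in slow-path call.)

    #include <cstdint>
    using u32 = std::uint32_t; using u64 = std::uint64_t; using s32 = std::int32_t;

    bool IsJITFault(u64 pc);        // stand-ins for the real JIT hooks
    bool TryMapAtAddress(u32 addr);
    s32  RewriteMemAccess(u64 pc);

    struct Fault { u32 emulatedAddr; u64 pc; };

    bool HandleFault(Fault& f, s32& pcAdjust)
    {
        if (!IsJITFault(f.pc))
            return false;                  // a genuine crash, let it propagate
        if (TryMapAtAddress(f.emulatedAddr))
        {
            pcAdjust = 0;                  // mapping exists now, just retry
            return true;
        }
        pcAdjust = RewriteMemAccess(f.pc); // patch to the slow path, skip ahead
        return true;
    }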
MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped)
-            rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr());
+            rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr);
 
-        s64 offset = 0;
         if (rewriteToSlowPath)
         {
-            offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC());
+            offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC);
         }
-        faultDesc->RestoreAndRepeat(offset);
+        return true;
     }
+    return false;
 }
 
 void Init()
 {
@@ -459,18 +526,34 @@ void Init()
     FastMem7Start = virtmemReserve(0x100000000);
     assert(FastMem7Start);
 
-    NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset;
-    NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset;
-    NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset;
-    NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset;
-#else
-    MemoryBase = new u8[MemoryTotalSize];
+    u8* basePtr = MemoryBaseCodeMem;
+#elif defined(_WIN32)
+    ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler);
+
+    MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL);
 
-    NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset;
-    NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset;
-    NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset;
-    NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset;
+    MemoryBase = (u8*)VirtualAlloc(NULL, MemoryTotalSize, MEM_RESERVE, PAGE_READWRITE);
+
+    FastMem9Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE);
+    FastMem7Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE);
+
+    // only free them after they have all been reserved
+    // so they can't overlap
+    VirtualFree(MemoryBase, 0, MEM_RELEASE);
+    VirtualFree(FastMem9Start, 0, MEM_RELEASE);
+    VirtualFree(FastMem7Start, 0, MEM_RELEASE);
+
+    MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase);
+
+    u8* basePtr = MemoryBase;
 #endif
+    NDS::MainRAM = basePtr + MemBlockMainRAMOffset;
+    NDS::SharedWRAM = basePtr + MemBlockSWRAMOffset;
+    NDS::ARM7WRAM = basePtr + MemBlockARM7WRAMOffset;
+    NDS::ARM9->DTCM = basePtr + MemBlockDTCMOffset;
+    DSi::NWRAM_A = basePtr + MemBlockNWRAM_AOffset;
+    DSi::NWRAM_B = basePtr + MemBlockNWRAM_BOffset;
+    DSi::NWRAM_C = basePtr + MemBlockNWRAM_COffset;
 }
 
 void DeInit()
 {
@@ -482,8 +565,11 @@ void DeInit()
     svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize);
     virtmemFree(MemoryBaseCodeMem, MemoryTotalSize);
     free(MemoryBase);
-#else
-    delete[] MemoryBase;
+#elif defined(_WIN32)
+    assert(UnmapViewOfFile(MemoryBase));
+    CloseHandle(MemoryFile);
+
+    RemoveVectoredExceptionHandler(ExceptionHandlerHandle);
 #endif
 }
 
@@ -505,12 +591,23 @@ void Reset()
     printf("done resetting jit mem\n");
 }
 
-bool IsMappable(int region)
+bool IsFastmemCompatible(int region)
 {
+#ifdef _WIN32
+    /*
+        TODO: with some hacks, the smaller shared WRAM regions
+        could be mapped on some occasions as well
+    */
+    if (region == memregion_DTCM
+        || region == memregion_SharedWRAM
+        || region == memregion_NewSharedWRAM_B
+        || region == memregion_NewSharedWRAM_C)
+        return false;
+#endif
     return OffsetsPerRegion[region] != UINT32_MAX;
 }
 
-bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize)
+bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize)
 {
     memoryOffset = 0;
     switch (region)
     {
@@ -518,137 +615,251 @@ bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize,
         case
memregion_ITCM:
            if (num == 0)
            {
-                mappingStart = 0;
-                mappingSize = NDS::ARM9->ITCMSize;
-                memorySize = ITCMPhysicalSize;
+                mirrorStart = addr & ~(ITCMPhysicalSize - 1);
+                mirrorSize = ITCMPhysicalSize;
                 return true;
             }
             return false;
-        case memregion_DTCM:
+        case memregion_MainRAM:
+            mirrorStart = addr & ~NDS::MainRAMMask;
+            mirrorSize = NDS::MainRAMMask + 1;
+            return true;
+        case memregion_BIOS9:
             if (num == 0)
             {
-                mappingStart = NDS::ARM9->DTCMBase;
-                mappingSize = NDS::ARM9->DTCMSize;
-                memorySize = DTCMPhysicalSize;
+                mirrorStart = addr & ~0xFFF;
+                mirrorSize = 0x1000;
                 return true;
             }
             return false;
-        case memregion_BIOS9:
-            if (num == 0)
+        case memregion_BIOS7:
+            if (num == 1)
             {
-                mappingStart = 0xFFFF0000;
-                mappingSize = 0x10000;
-                memorySize = 0x1000;
+                mirrorStart = 0;
+                mirrorSize = 0x4000;
                 return true;
             }
             return false;
-        case memregion_MainRAM:
-            mappingStart = 0x2000000;
-            mappingSize = 0x1000000;
-            memorySize = NDS::MainRAMSize;
-            return true;
-        case memregion_SWRAM:
-            mappingStart = 0x3000000;
+        case memregion_SharedWRAM:
             if (num == 0 && NDS::SWRAM_ARM9.Mem)
             {
-                mappingSize = 0x1000000;
+                mirrorStart = addr & ~NDS::SWRAM_ARM9.Mask;
+                mirrorSize = NDS::SWRAM_ARM9.Mask + 1;
                 memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM;
-                memorySize = NDS::SWRAM_ARM9.Mask + 1;
                 return true;
             }
             else if (num == 1 && NDS::SWRAM_ARM7.Mem)
             {
-                mappingSize = 0x800000;
+                mirrorStart = addr & ~NDS::SWRAM_ARM7.Mask;
+                mirrorSize = NDS::SWRAM_ARM7.Mask + 1;
                 memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM;
-                memorySize = NDS::SWRAM_ARM7.Mask + 1;
+                return true;
+            }
+            return false;
+        case memregion_WRAM7:
+            if (num == 1)
+            {
+                mirrorStart = addr & ~(NDS::ARM7WRAMSize - 1);
+                mirrorSize = NDS::ARM7WRAMSize;
                 return true;
             }
             return false;
         case memregion_VRAM:
             if (num == 0)
             {
-                // this is a gross simplification
-                // mostly to make code on vram working
-                // it doesn't take any of the actual VRAM mappings into account
-                mappingStart = 0x6000000;
-                mappingSize = 0x1000000;
-                memorySize = 0x100000;
+                mirrorStart = addr & ~0xFFFFF;
+                mirrorSize = 0x100000;
                 return true;
             }
             return false;
-        case memregion_BIOS7:
+        case memregion_VWRAM:
             if (num == 1)
             {
-                mappingStart = 0;
-                mappingSize = 0x4000;
-                memorySize = 0x4000;
+                mirrorStart = addr & ~0x3FFFF;
+                mirrorSize = 0x40000;
                 return true;
             }
             return false;
-        case memregion_WRAM7:
-            if (num == 1)
+        case memregion_NewSharedWRAM_A:
             {
-                if (NDS::SWRAM_ARM7.Mem)
+                u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]];
+                if (ptr)
                 {
-                    mappingStart = 0x3800000;
-                    mappingSize = 0x800000;
+                    memoryOffset = ptr - DSi::NWRAM_A;
+                    mirrorStart = addr & ~0xFFFF;
+                    mirrorSize = 0x10000;
+                    return true;
                 }
-                else
+                return false; // zero filled memory
+            }
+        case memregion_NewSharedWRAM_B:
+            {
+                u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]];
+                if (ptr)
                 {
-                    mappingStart = 0x3000000;
-                    mappingSize = 0x1000000;
+                    memoryOffset = ptr - DSi::NWRAM_B;
+                    mirrorStart = addr & ~0x7FFF;
+                    mirrorSize = 0x8000;
+                    return true;
                 }
-                memorySize = NDS::ARM7WRAMSize;
+                return false; // zero filled memory
+            }
+        case memregion_NewSharedWRAM_C:
+            {
+                u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]];
+                if (ptr)
+                {
+                    memoryOffset = ptr - DSi::NWRAM_C;
+                    mirrorStart = addr & ~0x7FFF;
+                    mirrorSize = 0x8000;
+                    return true;
+                }
+                return false; // zero filled memory
+            }
+        case memregion_BIOS9DSi:
+            if (num == 0)
+            {
+                mirrorStart = addr & ~0xFFFF;
+                mirrorSize = DSi::SCFG_BIOS & (1<<0) ?
0x8000 : 0x10000; return true; } return false; - case memregion_VWRAM: + case memregion_BIOS7DSi: if (num == 1) { - mappingStart = 0x6000000; - mappingSize = 0x1000000; - memorySize = 0x20000; + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<8) ? 0x8000 : 0x10000; return true; } return false; default: - // for the JIT we don't are about the rest + assert(false && "For the time being this should only be used for code"); return false; } } +u32 LocaliseAddress(int region, u32 num, u32 addr) +{ + switch (region) + { + case memregion_ITCM: + return (addr & (ITCMPhysicalSize - 1)) | (memregion_ITCM << 27); + case memregion_MainRAM: + return (addr & NDS::MainRAMMask) | (memregion_MainRAM << 27); + case memregion_BIOS9: + return (addr & 0xFFF) | (memregion_BIOS9 << 27); + case memregion_BIOS7: + return (addr & 0x3FFF) | (memregion_BIOS7 << 27); + case memregion_SharedWRAM: + if (num == 0) + return ((addr & NDS::SWRAM_ARM9.Mask) + (NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + else + return ((addr & NDS::SWRAM_ARM7.Mask) + (NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + case memregion_WRAM7: + return (addr & (NDS::ARM7WRAMSize - 1)) | (memregion_WRAM7 << 27); + case memregion_VRAM: + // TODO: take mapping properly into account + return (addr & 0xFFFFF) | (memregion_VRAM << 27); + case memregion_VWRAM: + // same here + return (addr & 0x3FFFF) | (memregion_VWRAM << 27); + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + return (ptr - DSi::NWRAM_A + (addr & 0xFFFF)) | (memregion_NewSharedWRAM_A << 27); + else + return memregion_Other << 27; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + return (ptr - DSi::NWRAM_B + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_B << 27); + else + return memregion_Other << 27; + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + return (ptr - DSi::NWRAM_C + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_C << 27); + else + return memregion_Other << 27; + } + case memregion_BIOS9DSi: + case memregion_BIOS7DSi: + return (addr & 0xFFFF) | (region << 27); + default: + assert(false && "This should only be needed for regions which can contain code"); + return memregion_Other << 27; + } +} + int ClassifyAddress9(u32 addr) { if (addr < NDS::ARM9->ITCMSize) + { return memregion_ITCM; + } else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + { return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else + } + else { + if (NDS::ConsoleType == 1 && addr >= 0xFFFF0000 && !(DSi::SCFG_BIOS & (1<<1))) + { + if ((addr >= 0xFFFF8000) && (DSi::SCFG_BIOS & (1<<0))) + return memregion_Other; + + return memregion_BIOS9DSi; + } + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + { + return memregion_BIOS9; + } + switch (addr & 0xFF000000) { case 0x02000000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[0][0] && addr < DSi::NWRAMEnd[0][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[0][1] && addr < DSi::NWRAMEnd[0][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[0][2] && addr < DSi::NWRAMEnd[0][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM9.Mem) - return memregion_SWRAM; - else - 
return memregion_Other; + return memregion_SharedWRAM; + return memregion_Other; case 0x04000000: return memregion_IO9; case 0x06000000: return memregion_VRAM; + default: + return memregion_Other; } } - return memregion_Other; } int ClassifyAddress7(u32 addr) { - if (addr < 0x00004000) + if (NDS::ConsoleType == 1 && addr < 0x00010000 && !(DSi::SCFG_BIOS & (1<<9))) + { + if (addr >= 0x00008000 && DSi::SCFG_BIOS & (1<<8)) + return memregion_Other; + + return memregion_BIOS7DSi; + } + else if (addr < 0x00004000) + { return memregion_BIOS7; + } else { switch (addr & 0xFF800000) @@ -657,10 +868,19 @@ int ClassifyAddress7(u32 addr) case 0x02800000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[1][0] && addr < DSi::NWRAMEnd[1][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[1][1] && addr < DSi::NWRAMEnd[1][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[1][2] && addr < DSi::NWRAMEnd[1][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM7.Mem) - return memregion_SWRAM; - else - return memregion_WRAM7; + return memregion_SharedWRAM; + return memregion_WRAM7; case 0x03800000: return memregion_WRAM7; case 0x04000000: @@ -740,14 +960,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + else { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; + switch (size | store) + { + case 8: return (void*)DSi::ARM9IORead8; + case 9: return (void*)DSi::ARM9IOWrite8; + case 16: return (void*)DSi::ARM9IORead16; + case 17: return (void*)DSi::ARM9IOWrite16; + case 32: return (void*)DSi::ARM9IORead32; + case 33: return (void*)DSi::ARM9IOWrite32; + } } break; case 0x06000000: @@ -781,14 +1016,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM7IORead8; + case 9: return (void*)DSi::ARM7IOWrite8; + case 16: return (void*)DSi::ARM7IORead16; + case 17: return (void*)DSi::ARM7IOWrite16; + case 32: return (void*)DSi::ARM7IORead32; + case 33: return (void*)DSi::ARM7IOWrite32; + } } break; case 0x04800000: diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h index 1a59d98..123e18e 100644 --- a/src/ARMJIT_Memory.h +++ b/src/ARMJIT_Memory.h @@ -23,7 +23,7 @@ enum memregion_DTCM, memregion_BIOS9, memregion_MainRAM, - memregion_SWRAM, + 
memregion_SharedWRAM, memregion_IO9, memregion_VRAM, memregion_BIOS7, @@ -31,18 +31,28 @@ enum memregion_IO7, memregion_Wifi, memregion_VWRAM, + + // DSi + memregion_BIOS9DSi, + memregion_BIOS7DSi, + memregion_NewSharedWRAM_A, + memregion_NewSharedWRAM_B, + memregion_NewSharedWRAM_C, + memregions_Count }; int ClassifyAddress9(u32 addr); int ClassifyAddress7(u32 addr); -bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize); +u32 LocaliseAddress(int region, u32 num, u32 addr); -bool IsMappable(int region); +bool IsFastmemCompatible(int region); void RemapDTCM(u32 newBase, u32 newSize); void RemapSWRAM(); +void RemapNWRAM(int num); void SetCodeProtection(int region, u32 offset, bool protect); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 34c1c91..d8bdd56 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -40,6 +40,12 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +#ifdef _WIN32 +const BitSet32 CallerSavedPushRegs({R10, R11}); +#else +const BitSet32 CallerSavedPushRegs({R9, R10, R11}); +#endif + void Compiler::PushRegs(bool saveHiRegs) { BitSet32 loadedRegs(RegCache.LoadedRegs); @@ -301,6 +307,107 @@ Compiler::Compiler() RET(); } + for (int consoleType = 0; consoleType < 2; consoleType++) + { + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 16; reg++) + { + if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2 || reg == ABI_PARAM3) + { + PatchedStoreFuncs[consoleType][num][size][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL; + continue; + } + + X64Reg rdMapped = (X64Reg)reg; + PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + MOV(32, R(ABI_PARAM3), R(rdMapped)); + } + else + { + MOV(32, R(ABI_PARAM2), R(rdMapped)); + } + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + MOV(64, R(ABI_PARAM2), R(RCPU)); + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9); break; + case 33: ABI_CallFunction(SlowRead7); break; + case 16: 
ABI_CallFunction(SlowRead9); break;
+                        case 17: ABI_CallFunction(SlowRead7); break;
+                        case 8: ABI_CallFunction(SlowRead9); break;
+                        case 9: ABI_CallFunction(SlowRead7); break;
+                        }
+                    }
+                    else
+                    {
+                        switch ((8 << size) | num)
+                        {
+                        case 32: ABI_CallFunction(SlowRead9); break;
+                        case 33: ABI_CallFunction(SlowRead7); break;
+                        case 16: ABI_CallFunction(SlowRead9); break;
+                        case 17: ABI_CallFunction(SlowRead7); break;
+                        case 8: ABI_CallFunction(SlowRead9); break;
+                        case 9: ABI_CallFunction(SlowRead7); break;
+                        }
+                    }
+                    ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8);
+                    if (signextend)
+                        MOVSX(32, 8 << size, rdMapped, R(RSCRATCH));
+                    else
+                        MOVZX(32, 8 << size, rdMapped, R(RSCRATCH));
+                    RET();
+                }
+            }
+        }
+    }
+
     // move the region forward to prevent overwriting the generated functions
     CodeMemSize -= GetWritableCodePtr() - ResetStart;
     ResetStart = GetWritableCodePtr();
@@ -500,6 +607,8 @@ void Compiler::Reset()
     NearCode = NearStart;
     FarCode = FarStart;
+
+    LoadStorePatches.clear();
 }
 
 bool Compiler::IsJITFault(u64 addr)
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index d1a6c07..0fe0147 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -7,6 +7,8 @@
 #include "../ARMJIT_Internal.h"
 #include "../ARMJIT_RegisterCache.h"
 
+#include <unordered_map>
+
 namespace ARMJIT
 {
@@ -18,6 +20,13 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX;
 const Gen::X64Reg RSCRATCH3 = Gen::ECX;
 const Gen::X64Reg RSCRATCH4 = Gen::R8;
 
+struct LoadStorePatch
+{
+    void* PatchFunc;
+    s16 Offset;
+    u16 Size;
+};
+
 struct Op2
 {
     Op2()
@@ -211,6 +220,11 @@ public:
     u8* NearStart;
     u8* FarStart;
 
+    void* PatchedStoreFuncs[2][2][3][16];
+    void* PatchedLoadFuncs[2][2][3][2][16];
+
+    std::unordered_map<u8*, LoadStorePatch> LoadStorePatches;
+
     u8* ResetStart;
     u32 CodeMemSize;
diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
index b780c55..2da113b 100644
--- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
@@ -17,7 +17,30 @@ int squeezePointer(T* ptr)
 
 s32 Compiler::RewriteMemAccess(u64 pc)
 {
-    return 0;
+    auto it = LoadStorePatches.find((u8*)pc);
+    if (it != LoadStorePatches.end())
+    {
+        LoadStorePatch patch = it->second;
+        LoadStorePatches.erase(it);
+
+        u8* curCodePtr = GetWritableCodePtr();
+        u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset;
+        SetCodePtr(rewritePtr);
+
+        CALL(patch.PatchFunc);
+        u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr);
+        if (remainingSize > 0)
+            NOP(remainingSize);
+
+        //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size);
+
+        SetCodePtr(curCodePtr);
+
+        return patch.Offset;
+    }
+
+    printf("this is a JIT bug %x\n", pc);
+    abort();
 }
 
 /*
@@ -91,369 +114,213 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag
         return;
     }
 
+    if (flags & memop_Store)
     {
-        if (flags & memop_Store)
-        {
-            Comp_AddCycles_CD();
-        }
-        else
-        {
-            Comp_AddCycles_CDI();
-        }
+        Comp_AddCycles_CD();
+    }
+    else
+    {
+        Comp_AddCycles_CDI();
+    }
 
-    bool addrIsStatic = Config::JIT_LiteralOptimisations
-        && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post));
-    u32 staticAddress;
-    if (addrIsStatic)
-        staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ?
-1 : 1); - OpArg rdMapped = MapReg(rd); + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + OpArg rdMapped = MapReg(rd); - if (true) - { - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); - X64Reg finalAddr = RSCRATCH3; - if (flags & memop_Post) - { - MOV(32, R(RSCRATCH3), rnMapped); + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) + { + MOV(32, R(RSCRATCH3), rnMapped); - finalAddr = rnMapped.GetSimpleReg(); - } + finalAddr = rnMapped.GetSimpleReg(); + } - if (op2.IsImm) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } else - { - OpArg rm = MapReg(op2.Reg.Reg); + MOV_sum(32, finalAddr, rnMapped, offset); + } + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) - { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); - } - else - { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); - if (flags & memop_SubtractOffset) - { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); - } - else - MOV_sum(32, finalAddr, rnMapped, offset); - } - } + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + u8* memopStart = GetWritableCodePtr(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] + : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; - /*int expectedTarget = Num == 0 - ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) - : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - if (CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; + assert(patch.PatchFunc != NULL); - bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); + MOV(64, R(RSCRATCH), ImmPtr(Num == 0 ? 
ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); - switch (expectedTarget) + X64Reg maskedAddr = RSCRATCH3; + if (size > 8) { - case memregion_MainRAM: - case memregion_DTCM: - case memregion_WRAM7: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_IO9: - case memregion_IO7: - case memregion_VWRAM: - compileFastPath = true; - break; - case memregion_Wifi: - compileFastPath = size >= 16; - break; - case memregion_VRAM: - compileFastPath = !(flags & memop_Store) || size >= 16; - case memregion_BIOS9: - compileFastPath = !(flags & memop_Store); - break; - default: break; + maskedAddr = RSCRATCH2; + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + AND(32, R(RSCRATCH2), Imm8(addressMask)); } - if (addrIsStatic && !compileFastPath) + u8* memopLoadStoreLocation = GetWritableCodePtr(); + if (flags & memop_Store) { - compileFastPath = false; - compileSlowPath = true; + MOV(size, MRegSum(RSCRATCH, maskedAddr), rdMapped); } - - if (addrIsStatic && compileSlowPath) - MOV(32, R(RSCRATCH3), Imm32(staticAddress)); -*/ - /*if (compileFastPath) + else { - FixupBranch slowPath; - if (compileSlowPath) - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SHR(32, R(RSCRATCH), Imm8(9)); - if (flags & memop_Store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - - slowPath = J_CC(CC_NE, true); - } + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); - if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 - || expectedTarget == memregion_BIOS9) + if (size == 32) { - u8* data; - u32 mask; - if (expectedTarget == memregion_MainRAM) - { - data = NDS::MainRAM; - mask = MAIN_RAM_SIZE - 1; - } - else if (expectedTarget == memregion_BIOS9) - { - data = NDS::ARM9BIOS; - mask = 0xFFF; - } - else - { - data = NDS::ARM7WRAM; - mask = 0xFFFF; - } - OpArg memLoc; - if (addrIsStatic) - { - memLoc = M(data + ((staticAddress & mask & addressMask))); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm32(mask & addressMask)); - memLoc = MDisp(RSCRATCH, squeezePointer(data)); - } - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_DTCM) - { - if (addrIsStatic) - MOV(32, R(RSCRATCH), Imm32(staticAddress)); - else - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) - { - MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - if (addrIsStatic) - { - MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm8(addressMask)); - } - AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else - { - u32 maskedDataRegion; - - if (addrIsStatic) - { - maskedDataRegion = staticAddress; - MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); - } - else - { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - AND(32, R(ABI_PARAM1), Imm8(addressMask)); - - maskedDataRegion = CurInstr.DataRegion; - if (Num == 0) - maskedDataRegion &= ~0xFFFFFF; - else - maskedDataRegion &= ~0x7FFFFF; - } + } - void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); + patch.Offset = memopStart - memopLoadStoreLocation; + patch.Size = GetWritableCodePtr() - memopStart; - if (flags & memop_Store) - { - PushRegs(false); + assert(patch.Size >= 5); - MOV(32, R(ABI_PARAM2), rdMapped); + LoadStorePatches[memopLoadStoreLocation] = patch; + } + else + { + PushRegs(false); - ABI_CallFunction((void(*)())func); + if (Num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); - PopRegs(false); - } - else + switch (size | NDS::ConsoleType) { - if (!addrIsStatic) - MOV(32, rdMapped, R(RSCRATCH3)); - - PushRegs(false); - - ABI_CallFunction((void(*)())func); - - PopRegs(false); - - if (!addrIsStatic) - MOV(32, R(RSCRATCH3), rdMapped); - - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + case 33: CALL((void*)&SlowWrite9); break; + case 17: CALL((void*)&SlowWrite9); break; + case 9: CALL((void*)&SlowWrite9); break; } } - - if ((size == 32 && !(flags & memop_Store))) + else { - if (addrIsStatic) - { - if (staticAddress & 0x3) - ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); - } - else + switch (size | NDS::ConsoleType) { - AND(32, R(RSCRATCH3), Imm8(0x3)); - SHL(32, R(RSCRATCH3), Imm8(3)); - ROR_(32, rdMapped, R(RSCRATCH3)); + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + case 33: CALL((void*)&SlowRead9); break; + case 17: CALL((void*)&SlowRead9); break; + case 9: CALL((void*)&SlowRead9); break; } } - - if (compileSlowPath) - { - SwitchToFarCode(); - SetJumpTarget(slowPath); - } } -*/ - if (true) + else { - PushRegs(false); - - if (Num == 0) + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) { - MOV(64, R(ABI_PARAM2), R(RCPU)); - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) - { - MOV(32, R(ABI_PARAM3), rdMapped); + MOV(32, R(ABI_PARAM2), rdMapped); - switch (size) - { - case 32: CALL((void*)&SlowWrite9); break; - case 16: CALL((void*)&SlowWrite9); break; - case 
8: CALL((void*)&SlowWrite9); break; - } - } - else + switch (size | NDS::ConsoleType) { - switch (size) - { - case 32: CALL((void*)&SlowRead9); break; - case 16: CALL((void*)&SlowRead9); break; - case 8: CALL((void*)&SlowRead9); break; - } + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + case 33: CALL((void*)&SlowWrite7); break; + case 17: CALL((void*)&SlowWrite7); break; + case 9: CALL((void*)&SlowWrite7); break; } } else { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) + switch (size | NDS::ConsoleType) { - MOV(32, R(ABI_PARAM2), rdMapped); - - switch (size) - { - case 32: CALL((void*)&SlowWrite7); break; - case 16: CALL((void*)&SlowWrite7); break; - case 8: CALL((void*)&SlowWrite7); break; - } - } - else - { - switch (size) - { - case 32: CALL((void*)&SlowRead7); break; - case 16: CALL((void*)&SlowRead7); break; - case 8: CALL((void*)&SlowRead7); break; - } + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + case 33: CALL((void*)&SlowRead7); break; + case 17: CALL((void*)&SlowRead7); break; + case 9: CALL((void*)&SlowRead7); break; } } + } - PopRegs(false); + PopRegs(false); - if (!(flags & memop_Store)) - { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - } - } -/* - if (compileFastPath && compileSlowPath) + if (!(flags & memop_Store)) { - FixupBranch ret = J(true); - SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } - if (!(flags & memop_Store) && rd == 15) + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); { - if (size < 32) - printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + if (Num == 1) { - if (Num == 1) - AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rdMapped.GetSimpleReg()); + if (Thumb) + OR(32, rdMapped, Imm8(0x1)); + else + AND(32, rdMapped, Imm8(0xFE)); } + Comp_JumpTo(rdMapped.GetSimpleReg()); } } } @@ -470,7 +337,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int flags = 0; if (store) flags |= memop_Store; - if (decrement) + if (decrement && preinc) flags |= memop_SubtractOffset; Op2 offset = preinc ? Op2(4) : Op2(0); @@ -481,96 +348,52 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - // we need to make sure that the stack stays aligned to 16 bytes -#ifdef _WIN32 - // include shadow - u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; -#else - u32 stackAlloc = ((regsCount + 1) & ~1) * 8; -#endif - u32 allocOffset = stackAlloc - regsCount * 8; -/* int expectedTarget = Num == 0 - ? ClassifyAddress9(CurInstr.DataRegion) - : ClassifyAddress7(CurInstr.DataRegion); - if (usermode || CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; - - bool compileFastPath = false; + ? 
ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - switch (expectedTarget) - { - case memregion_DTCM: - case memregion_MainRAM: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_WRAM7: - compileFastPath = true; - break; - default: - break; - } -*/ if (!store) Comp_AddCycles_CDI(); else Comp_AddCycles_CD(); + bool compileFastPath = Config::JIT_FastMemory + && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = (((regsCount + 4 + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#else + u32 stackAlloc = (((regsCount + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; + if (decrement) - { - MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4 + (preinc ? 0 : 4))); else - MOV(32, R(RSCRATCH4), MapReg(rn)); -/* + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(preinc ? 4 : 0)); + if (compileFastPath) { - assert(!usermode); + AND(32, R(RSCRATCH4), Imm8(~3)); - MOV(32, R(RSCRATCH), R(RSCRATCH4)); - SHR(32, R(RSCRATCH), Imm8(9)); + u8* fastPathStart = GetWritableCodePtr(); + u8* firstLoadStoreAddr; - if (store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - FixupBranch slowPath = J_CC(CC_NE, true); + bool firstLoadStore = true; + + MOV(64, R(RSCRATCH2), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + ADD(64, R(RSCRATCH2), R(RSCRATCH4)); + MOV(32, R(RSCRATCH3), R(RSCRATCH4)); - if (expectedTarget == memregion_DTCM) - { - SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); - LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); - } - else if (expectedTarget == memregion_MainRAM) - { - AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); - } - else if (expectedTarget == memregion_WRAM7) - { - AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); - } - else // SWRAM - { - AND(32, R(RSCRATCH4), Imm8(~3)); - AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - } u32 offset = 0; for (int reg : regs) { - if (preinc) - offset += 4; - OpArg mem = MDisp(RSCRATCH4, offset); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + + OpArg mem = MDisp(RSCRATCH2, offset); if (store) { if (RegCache.LoadedRegs & (1 << reg)) @@ -580,6 +403,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc else { LoadReg(reg, RSCRATCH); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); MOV(32, mem, R(RSCRATCH)); } } @@ -595,13 +420,19 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SaveReg(reg, RSCRATCH); } } - if (!preinc) - offset += 4; + offset += 4; + + firstLoadStore = false; } + LoadStorePatch patch; + patch.Size = GetWritableCodePtr() - fastPathStart; + patch.Offset = fastPathStart - firstLoadStoreAddr; SwitchToFarCode(); - SetJumpTarget(slowPath); - }*/ + patch.PatchFunc = GetWritableCodePtr(); + + LoadStorePatches[firstLoadStoreAddr] = patch; + } if (!store) { @@ -618,12 +449,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } PopRegs(false); @@ -715,25 +546,24 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); PopRegs(false); } -/* + if (compileFastPath) { - FixupBranch ret = J(true); + RET(); SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + } if (!store && regs[15]) { diff --git a/src/CP15.cpp b/src/CP15.cpp index 3d64259..992c83f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -608,6 +608,27 @@ void ARMv5::CP15Write(u32 id, u32 val) ITCMSetting = val; UpdateITCMSetting(); return; + + case 0xF00: + //printf("cache debug index register %08X\n", val); + return; + + case 0xF10: + //printf("cache debug instruction tag %08X\n", val); + return; + + case 0xF20: + //printf("cache debug data tag %08X\n", val); + return; + + case 0xF30: + //printf("cache debug instruction cache %08X\n", val); + return; + + case 0xF40: + //printf("cache debug data cache %08X\n", val); + return; + } if ((id&0xF00)!=0x700) diff --git a/src/Config.cpp b/src/Config.cpp index edf84f2..de1c70d 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,14 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; -int JIT_LiteralOptimisations = true; -#endif - -#ifdef JIT_ENABLED -int JIT_Enable = false; -int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = true; +int JIT_BranchOptimisations = 2; int JIT_LiteralOptimisations = true; int JIT_FastMemory = true; #endif @@ -66,16 +59,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, - {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, -#endif - -#ifdef JIT_ENABLED - {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, - {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, - {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, + {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 7b19a4b..5916b4a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,14 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; -extern int JIT_LiteralOptimisations; -#endif - -#ifdef JIT_ENABLED -extern int JIT_Enable; -extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; +extern int JIT_BranchOptimisations; extern int JIT_LiteralOptimisations; extern int JIT_FastMemory; #endif diff --git a/src/DSi.cpp b/src/DSi.cpp index 216f724..97a63cd 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -26,6 +26,11 @@ #include "NDSCart.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi_NDMA.h" #include "DSi_I2C.h" #include "DSi_SD.h" @@ -34,15 +39,6 @@ #include "tiny-AES-c/aes.hpp" -namespace NDS -{ - -extern ARMv5* ARM9; -extern ARMv4* ARM7; - -} - - namespace DSi { @@ -59,9 +55,9 @@ u8 ARM7iBIOS[0x10000]; u32 MBK[2][9]; -u8 NWRAM_A[0x40000]; -u8 NWRAM_B[0x40000]; -u8 NWRAM_C[0x40000]; +u8* NWRAM_A; +u8* NWRAM_B; +u8* NWRAM_C; u8* NWRAMMap_A[2][4]; u8* NWRAMMap_B[3][8]; @@ -86,6 +82,12 @@ u8 ARM7Init[0x3C00]; bool Init() { +#ifndef JIT_ENABLED + NWRAM_A = new u8[NWRAMSize]; + NWRAM_B = new u8[NWRAMSize]; + NWRAM_C = new u8[NWRAMSize]; +#endif + if 
(!DSi_I2C::Init()) return false; if (!DSi_AES::Init()) return false; @@ -106,6 +108,12 @@ bool Init() void DeInit() { +#ifndef JIT_ENABLED + delete[] NWRAM_A; + delete[] NWRAM_B; + delete[] NWRAM_C; +#endif + DSi_I2C::DeInit(); DSi_AES::DeInit(); @@ -176,7 +184,12 @@ void SoftReset() NDS::ARM9->Reset(); NDS::ARM7->Reset(); + NDS::ARM9->CP15Reset(); + memcpy(NDS::ARM9->ITCM, ITCMInit, 0x8000); +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidateITCM(); +#endif DSi_AES::Reset(); @@ -274,9 +287,9 @@ bool LoadNAND() { printf("Loading DSi NAND\n"); - memset(NWRAM_A, 0, 0x40000); - memset(NWRAM_B, 0, 0x40000); - memset(NWRAM_C, 0, 0x40000); + memset(NWRAM_A, 0, NWRAMSize); + memset(NWRAM_B, 0, NWRAMSize); + memset(NWRAM_C, 0, NWRAMSize); memset(MBK, 0, sizeof(MBK)); memset(NWRAMMap_A, 0, sizeof(NWRAMMap_A)); @@ -527,6 +540,8 @@ void MapNWRAM_A(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(0); + int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -558,6 +573,8 @@ void MapNWRAM_B(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(1); + int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -593,6 +610,8 @@ void MapNWRAM_C(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(2); + int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -625,6 +644,8 @@ void MapNWRAMRange(u32 cpu, u32 num, u32 val) u32 oldval = MBK[cpu][5+num]; if (oldval == val) return; + ARMJIT_Memory::RemapNWRAM(num); + MBK[cpu][5+num] = val; // TODO: what happens when the ranges are 'out of range'???? @@ -826,19 +847,31 @@ void ARM9Write8(u32 addr, u8 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write8(addr, val); @@ -859,19 +892,31 @@ void ARM9Write16(u32 addr, u16 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; 
} return NDS::ARM9Write16(addr, val); @@ -892,19 +937,31 @@ void ARM9Write32(u32 addr, u32 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write32(addr, val); @@ -1085,19 +1142,37 @@ void ARM7Write8(u32 addr, u8 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); +#endif + } return; } return NDS::ARM7Write8(addr, val); @@ -1118,19 +1193,31 @@ void ARM7Write16(u32 addr, u16 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write16(addr, val); @@ -1151,19 +1238,31 @@ void ARM7Write32(u32 addr, u32 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < 
NWRAMEnd[1][1])
     {
         u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]];
-        if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val;
+        if (ptr)
+        {
+            *(u32*)&ptr[addr & 0x7FFF] = val;
+            ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr);
+        }
         return;
     }
     if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2])
     {
         u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]];
-        if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val;
+        if (ptr)
+        {
+            *(u32*)&ptr[addr & 0x7FFF] = val;
+            ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr);
+        }
         return;
     }
     return NDS::ARM7Write32(addr, val);
@@ -1521,7 +1620,7 @@ u8 ARM7IORead8(u32 addr)
     case 0x04004501: return DSi_I2C::Cnt;
 
     case 0x04004D00: if (SCFG_BIOS & (1<<10)) return 0; return ConsoleID & 0xFF;
     case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF;
     case 0x04004D02: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 16) & 0xFF;
     case 0x04004D03: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 24) & 0xFF;
     case 0x04004D04: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 32) & 0xFF;
diff --git a/src/DSi.h b/src/DSi.h
index 8cc8fd5..40f22bb 100644
--- a/src/DSi.h
+++ b/src/DSi.h
@@ -25,6 +25,8 @@
 namespace DSi
 {
 
+extern u16 SCFG_BIOS;
+
 extern u8 ARM9iBIOS[0x10000];
 extern u8 ARM7iBIOS[0x10000];
 
@@ -34,6 +36,19 @@ extern u64 ConsoleID;
 extern DSi_SDHost* SDMMC;
 extern DSi_SDHost* SDIO;
 
+const u32 NWRAMSize = 0x40000;
+
+extern u8* NWRAM_A;
+extern u8* NWRAM_B;
+extern u8* NWRAM_C;
+
+extern u8* NWRAMMap_A[2][4];
+extern u8* NWRAMMap_B[3][8];
+extern u8* NWRAMMap_C[3][8];
+
+extern u32 NWRAMStart[2][3];
+extern u32 NWRAMEnd[2][3];
+extern u32 NWRAMMask[2][3];
 
 bool Init();
 void DeInit();
diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp
index 9984f5e..e22c708 100644
--- a/src/DSi_I2C.cpp
+++ b/src/DSi_I2C.cpp
@@ -21,6 +21,7 @@
 #include "DSi.h"
 #include "DSi_I2C.h"
 #include "DSi_Camera.h"
+#include "ARM.h"
 
 namespace DSi_BPTWL
 {
@@ -108,7 +109,8 @@ void Write(u8 val, bool last)
             printf("BPTWL: soft-reset\n");
             val = 0; // checkme
             // TODO: soft-reset might need to be scheduled later!
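The hunk continues below by replacing the direct DSi::SoftReset() call with NDS::ARM7->Halt(4): resetting the whole console from inside an MMIO write handler would tear down JIT state while a compiled block is still on the call stack. A minimal sketch of how such a deferred reset can then be drained once execution has unwound — the frame-loop function and the pending-reset check are illustrative assumptions, not part of this patch:

void RunFrameAndDrainReset()
{
    NDS::RunFrame();

    // Halt state 4 was set by the BPTWL soft-reset write above; servicing
    // it here means no JIT-compiled code is executing anymore.
    if (NDS::ARM7->Halted == 4)
    {
        NDS::ARM7->Halted = 0;
        DSi::SoftReset();
    }
}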
- DSi::SoftReset(); + // TODO: this has been moved for the JIT to work, nothing is confirmed here + NDS::ARM7->Halt(4); CurPos = -1; return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 3d65482..6981a42 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -32,8 +32,11 @@ #include "Wifi.h" #include "AREngine.h" #include "Platform.h" + +#ifdef JIT_ENABLED #include "ARMJIT.h" #include "ARMJIT_Memory.h" +#endif #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -173,7 +176,7 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); #else - MainRAM = new u8[MainRAMSize]; + MainRAM = new u8[0x1000000]; ARM7WRAM = new u8[ARM7WRAMSize]; SharedWRAM = new u8[SharedWRAMSize]; #endif @@ -1837,7 +1840,7 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -1902,7 +1905,7 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -2031,16 +2034,13 @@ void ARM9Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2090,16 +2090,13 @@ void ARM9Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2168,16 +2165,13 @@ void ARM9Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return ; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2235,7 +2229,7 @@ void ARM9Write32(u32 addr, u32 val) return; } - printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); + //printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); } bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) @@ -2475,16 +2469,13 @@ void ARM7Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2552,16 +2543,13 @@ void 
ARM7Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2639,16 +2627,13 @@ void ARM7Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; diff --git a/src/NDS.h b/src/NDS.h index 4b4f9a1..e0a5045 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -165,6 +165,8 @@ extern u16 ARM7BIOSProt; extern u8* MainRAM; extern u32 MainRAMMask; +const u32 MainRAMMaxSize = 0x1000000; + const u32 SharedWRAMSize = 0x8000; extern u8* SharedWRAM; diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 09faf4e..9ee7b9a 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -32,6 +32,7 @@ EmuSettingsDialog* EmuSettingsDialog::currentDlg = nullptr; extern char* EmuDirectory; +extern bool RunningSomething; EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::EmuSettingsDialog) @@ -53,6 +54,22 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->cbxConsoleType->setCurrentIndex(Config::ConsoleType); ui->chkDirectBoot->setChecked(Config::DirectBoot != 0); + +#ifdef JIT_ENABLED + ui->chkEnableJIT->setChecked(Config::JIT_Enable != 0); + ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); + ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); + ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); +#else + ui->chkEnableJIT->setDisabled(true); + ui->chkJITBranchOptimisations->setDisabled(true); + ui->chkJITLiteralOptimisations->setDisabled(true); + ui->chkJITFastMemory->setDisabled(true); + ui->spnJITMaximumBlockSize->setDisabled(true); +#endif + + on_chkEnableJIT_toggled(); } EmuSettingsDialog::~EmuSettingsDialog() @@ -102,29 +119,78 @@ void EmuSettingsDialog::verifyFirmware() } } -void EmuSettingsDialog::on_EmuSettingsDialog_accepted() +void EmuSettingsDialog::done(int r) { - verifyFirmware(); - - strncpy(Config::BIOS9Path, ui->txtBIOS9Path->text().toStdString().c_str(), 1023); Config::BIOS9Path[1023] = '\0'; - strncpy(Config::BIOS7Path, ui->txtBIOS7Path->text().toStdString().c_str(), 1023); Config::BIOS7Path[1023] = '\0'; - strncpy(Config::FirmwarePath, ui->txtFirmwarePath->text().toStdString().c_str(), 1023); Config::FirmwarePath[1023] = '\0'; - - strncpy(Config::DSiBIOS9Path, ui->txtDSiBIOS9Path->text().toStdString().c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; - strncpy(Config::DSiBIOS7Path, ui->txtDSiBIOS7Path->text().toStdString().c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; - strncpy(Config::DSiFirmwarePath, 
ui->txtDSiFirmwarePath->text().toStdString().c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; - strncpy(Config::DSiNANDPath, ui->txtDSiNANDPath->text().toStdString().c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; - - Config::ConsoleType = ui->cbxConsoleType->currentIndex(); - Config::DirectBoot = ui->chkDirectBoot->isChecked() ? 1:0; - - Config::Save(); + if (r == QDialog::Accepted) + { + verifyFirmware(); + + int consoleType = ui->cbxConsoleType->currentIndex(); + int directBoot = ui->chkDirectBoot->isChecked() ? 1:0; + + int jitEnable = ui->chkEnableJIT->isChecked() ? 1:0; + int jitMaxBlockSize = ui->spnJITMaximumBlockSize->value(); + int jitBranchOptimisations = ui->chkJITBranchOptimisations->isChecked() ? 1:0; + int jitLiteralOptimisations = ui->chkJITLiteralOptimisations->isChecked() ? 1:0; + int jitFastMemory = ui->chkJITFastMemory->isChecked() ? 1:0; + + std::string bios9Path = ui->txtBIOS9Path->text().toStdString(); + std::string bios7Path = ui->txtBIOS7Path->text().toStdString(); + std::string firmwarePath = ui->txtFirmwarePath->text().toStdString(); + std::string dsiBios9Path = ui->txtDSiBIOS9Path->text().toStdString(); + std::string dsiBios7Path = ui->txtDSiBIOS7Path->text().toStdString(); + std::string dsiFirmwarePath = ui->txtDSiFirmwarePath->text().toStdString(); + std::string dsiNANDPath = ui->txtDSiNANDPath->text().toStdString(); + + if (consoleType != Config::ConsoleType + || directBoot != Config::DirectBoot +#ifdef JIT_ENABLED + || jitEnable != Config::JIT_Enable + || jitMaxBlockSize != Config::JIT_MaxBlockSize + || jitBranchOptimisations != Config::JIT_BranchOptimisations + || jitLiteralOptimisations != Config::JIT_LiteralOptimisations + || jitFastMemory != Config::JIT_FastMemory +#endif + || strcmp(Config::BIOS9Path, bios9Path.c_str()) != 0 + || strcmp(Config::BIOS7Path, bios7Path.c_str()) != 0 + || strcmp(Config::FirmwarePath, firmwarePath.c_str()) != 0 + || strcmp(Config::DSiBIOS9Path, dsiBios9Path.c_str()) != 0 + || strcmp(Config::DSiBIOS7Path, dsiBios7Path.c_str()) != 0 + || strcmp(Config::DSiFirmwarePath, dsiFirmwarePath.c_str()) != 0 + || strcmp(Config::DSiNANDPath, dsiNANDPath.c_str()) != 0) + { + if (RunningSomething + && QMessageBox::warning(this, "Reset necessary to apply changes", + "The emulation will be reset for the changes to take place", + QMessageBox::Yes, QMessageBox::Cancel) != QMessageBox::Yes) + return; + + strncpy(Config::BIOS9Path, bios9Path.c_str(), 1023); Config::BIOS9Path[1023] = '\0'; + strncpy(Config::BIOS7Path, bios7Path.c_str(), 1023); Config::BIOS7Path[1023] = '\0'; + strncpy(Config::FirmwarePath, firmwarePath.c_str(), 1023); Config::FirmwarePath[1023] = '\0'; + + strncpy(Config::DSiBIOS9Path, dsiBios9Path.c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; + strncpy(Config::DSiBIOS7Path, dsiBios7Path.c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; + strncpy(Config::DSiFirmwarePath, dsiFirmwarePath.c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; + strncpy(Config::DSiNANDPath, dsiNANDPath.c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; + + #ifdef JIT_ENABLED + Config::JIT_Enable = jitEnable; + Config::JIT_MaxBlockSize = jitMaxBlockSize; + Config::JIT_BranchOptimisations = jitBranchOptimisations; + Config::JIT_LiteralOptimisations = jitLiteralOptimisations; + Config::JIT_FastMemory = jitFastMemory; + #endif + + Config::ConsoleType = consoleType; + Config::DirectBoot = directBoot; + + Config::Save(); + } + } - closeDlg(); -} + QDialog::done(r); -void EmuSettingsDialog::on_EmuSettingsDialog_rejected() -{ closeDlg(); } @@ 
-211,3 +277,12 @@ void EmuSettingsDialog::on_btnDSiNANDBrowse_clicked()
 
     ui->txtDSiNANDPath->setText(file);
 }
+
+void EmuSettingsDialog::on_chkEnableJIT_toggled()
+{
+    bool disabled = !ui->chkEnableJIT->isChecked();
+    ui->chkJITBranchOptimisations->setDisabled(disabled);
+    ui->chkJITLiteralOptimisations->setDisabled(disabled);
+    ui->chkJITFastMemory->setDisabled(disabled);
+    ui->spnJITMaximumBlockSize->setDisabled(disabled);
+}
\ No newline at end of file
diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.h b/src/frontend/qt_sdl/EmuSettingsDialog.h
index f604ba5..268036c 100644
--- a/src/frontend/qt_sdl/EmuSettingsDialog.h
+++ b/src/frontend/qt_sdl/EmuSettingsDialog.h
@@ -51,8 +51,7 @@ public:
     }
 
 private slots:
-    void on_EmuSettingsDialog_accepted();
-    void on_EmuSettingsDialog_rejected();
+    void done(int r);
 
     void on_btnBIOS9Browse_clicked();
     void on_btnBIOS7Browse_clicked();
@@ -63,6 +62,8 @@
     void on_btnDSiFirmwareBrowse_clicked();
     void on_btnDSiNANDBrowse_clicked();
 
+    void on_chkEnableJIT_toggled();
+
 private:
     void verifyFirmware();
 
diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.ui b/src/frontend/qt_sdl/EmuSettingsDialog.ui
index 4894fa5..11d48cc 100644
--- a/src/frontend/qt_sdl/EmuSettingsDialog.ui
+++ b/src/frontend/qt_sdl/EmuSettingsDialog.ui

[The EmuSettingsDialog.ui hunks were garbled in extraction: the XML markup was stripped, leaving only text values. What remains recoverable: the dialog is resized from 490x392 to 514x359 (the QLayout::SetFixedSize policy is kept), and its contents are reorganised into a tab widget ("tabWidget") with three tabs. "General" holds the console-type combo box and the "Boot game directly" checkbox plus a vertical spacer; "BIOS Files" holds the former "DS mode" and "DSi mode" groups with their path fields, tooltips and Browse buttons; the new "CPU Emulation" tab adds the "Enable JIT recompiler" checkbox, a "Maximum JIT block size:" spinbox (range 1-32, default 32) and the "Branch Optimisations", "Literal Optimisations" and "Fast Memory" checkboxes. An explicit tab order is added (tabWidget, cbxConsoleType, chkDirectBoot, the path fields and Browse buttons, then chkEnableJIT and spnJITMaximumBlockSize), and the buttonBox accept()/reject() connection coordinates are updated for the new geometry.]

diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp
index fa542ad..4557d0e 100644
--- a/src/frontend/qt_sdl/main.cpp
+++ b/src/frontend/qt_sdl/main.cpp
@@ -1641,7 +1641,14 @@ void MainWindow::onStop()
 
 void MainWindow::onOpenEmuSettings()
 {
-    EmuSettingsDialog::openDlg(this);
+    EmuSettingsDialog* dlg = EmuSettingsDialog::openDlg(this);
+    connect(dlg, &EmuSettingsDialog::finished, this, &MainWindow::onEmuSettingsDialogFinished);
+}
+
+void MainWindow::onEmuSettingsDialogFinished(int res)
+{
+    if (RunningSomething)
+        onReset();
 }
 
 void MainWindow::onOpenInputConfig()
diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h
index 279aed8..eec2a48 100644
--- a/src/frontend/qt_sdl/main.h
+++ b/src/frontend/qt_sdl/main.h
@@ -199,6 +199,7 @@ private slots:
     void onStop();
 
     void onOpenEmuSettings();
+    void onEmuSettingsDialogFinished(int res);
     void onOpenInputConfig();
     void onInputConfigFinished(int res);
     void onOpenVideoSettings();
diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp
deleted file mode 100644
index 0df9c6c..0000000
--- a/src/libui_sdl/DlgEmuSettings.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
-    Copyright 2016-2020 Arisotura
-
-    This file is part of melonDS.
-
-    melonDS is free software: you can redistribute it and/or modify it under
-    the terms of the GNU General Public License as published by the Free
-    Software Foundation, either version 3 of the License, or (at your option)
-    any later version.
-
-    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
-    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with melonDS. If not, see http://www.gnu.org/licenses/.
-*/ - -#include -#include - -#include "libui/ui.h" - -#include "../types.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" - - -void ApplyNewSettings(int type); - -extern bool RunningSomething; - -namespace DlgEmuSettings -{ - -bool opened; -uiWindow* win; - -uiCheckbox* cbDirectBoot; - -#ifdef JIT_ENABLED -uiCheckbox* cbJITEnabled; -uiEntry* enJITMaxBlockSize; -uiCheckbox* cbJITBranchOptimisations; -uiCheckbox* cbJITLiteralOptimisations; -#endif - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - opened = false; - return 1; -} - -void OnCancel(uiButton* btn, void* blarg) -{ - uiControlDestroy(uiControl(win)); - opened = false; -} - -void OnOk(uiButton* btn, void* blarg) -{ -#ifdef JIT_ENABLED - bool restart = false; - - bool enableJit = uiCheckboxChecked(cbJITEnabled); - char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); - long blockSize = strtol(maxBlockSizeStr, NULL, 10); - bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); - bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); - uiFreeText(maxBlockSizeStr); - if (blockSize < 1) - blockSize = 1; - if (blockSize > 32) - blockSize = 32; - - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize - || branchOptimisations != Config::JIT_BrancheOptimisations - || literalOptimisations != Config::JIT_LiteralOptimisations) - { - if (RunningSomething && - !uiMsgBoxConfirm(win, "Reset emulator", - "Changing JIT settings requires a reset.\n\nDo you want to continue?")) - return; - - Config::JIT_Enable = enableJit; - Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = branchOptimisations; - Config::JIT_LiteralOptimisations = literalOptimisations; - - restart = true; - } -#endif - - Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); - - Config::Save(); - - uiControlDestroy(uiControl(win)); - opened = false; - -#ifdef JIT_ENABLED - if (restart) - ApplyNewSettings(4); -#endif -} - -#ifdef JIT_ENABLED -void OnJITStateChanged(uiCheckbox* cb, void* blarg) -{ - if (uiCheckboxChecked(cb)) - { - uiControlEnable(uiControl(enJITMaxBlockSize)); - uiControlEnable(uiControl(cbJITBranchOptimisations)); - uiControlEnable(uiControl(cbJITLiteralOptimisations)); - } - else - { - uiControlDisable(uiControl(enJITMaxBlockSize)); - uiControlDisable(uiControl(cbJITBranchOptimisations)); - uiControlDisable(uiControl(cbJITLiteralOptimisations)); - } -} -#endif - -void Open() -{ - if (opened) - { - uiControlSetFocus(uiControl(win)); - return; - } - - opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0); - uiWindowSetMargined(win, 1); - uiWindowOnClosing(win, OnCloseWindow, NULL); - - uiBox* top = uiNewVerticalBox(); - uiWindowSetChild(win, uiControl(top)); - - { - uiBox* in_ctrl = uiNewVerticalBox(); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - cbDirectBoot = uiNewCheckbox("Boot game directly"); - uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); - } - -#ifdef JIT_ENABLED - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiGroup* grp = uiNewGroup("JIT"); - uiBoxAppend(top, uiControl(grp), 1); - - uiBox* in_ctrl = uiNewVerticalBox(); - uiGroupSetChild(grp, uiControl(in_ctrl)); - - cbJITEnabled = uiNewCheckbox("Enable JIT recompiler"); - uiBoxAppend(in_ctrl, uiControl(cbJITEnabled), 0); - - uiCheckboxOnToggled(cbJITEnabled, OnJITStateChanged, NULL); - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("Maximum block size (1-32): "); - 
uiBoxAppend(row, uiControl(lbl), 0); - - enJITMaxBlockSize = uiNewEntry(); - uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); - uiBoxAppend(row, uiControl(lbl), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); - uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); - uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); - } - } -#endif - - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiBox* in_ctrl = uiNewHorizontalBox(); - uiBoxSetPadded(in_ctrl, 1); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(in_ctrl, uiControl(dummy), 1); - - uiButton* btncancel = uiNewButton("Cancel"); - uiButtonOnClicked(btncancel, OnCancel, NULL); - uiBoxAppend(in_ctrl, uiControl(btncancel), 0); - - uiButton* btnok = uiNewButton("Ok"); - uiButtonOnClicked(btnok, OnOk, NULL); - uiBoxAppend(in_ctrl, uiControl(btnok), 0); - } - - uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); - -#ifdef JIT_ENABLED - uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); - { - char maxBlockSizeStr[10]; - sprintf(maxBlockSizeStr, "%d", Config::JIT_MaxBlockSize); - uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); - } - OnJITStateChanged(cbJITEnabled, NULL); - - uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); - uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); -#endif - - uiControlShow(uiControl(win)); -} - -void Close() -{ - if (!opened) return; - uiControlDestroy(uiControl(win)); - opened = false; -} - -} diff --git a/src/libui_sdl/libui/ui.h b/src/libui_sdl/libui/ui.h deleted file mode 100644 index e45fe91..0000000 --- a/src/libui_sdl/libui/ui.h +++ /dev/null @@ -1,764 +0,0 @@ -// 6 april 2015 - -// TODO add a uiVerifyControlType() function that can be used by control implementations to verify controls - -#ifndef __LIBUI_UI_H__ -#define __LIBUI_UI_H__ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// this macro is generated by cmake -#ifdef libui_EXPORTS -#ifdef _WIN32 -#define _UI_EXTERN __declspec(dllexport) extern -#else -#define _UI_EXTERN __attribute__((visibility("default"))) extern -#endif -#else -// TODO add __declspec(dllimport) on windows, but only if not static -#define _UI_EXTERN extern -#endif - -// C++ is really really really really really really dumb about enums, so screw that and just make them anonymous -// This has the advantage of being ABI-able should we ever need an ABI... -#define _UI_ENUM(s) typedef unsigned int s; enum - -// This constant is provided because M_PI is nonstandard. -// This comes from Go's math.Pi, which in turn comes from http://oeis.org/A000796. -#define uiPi 3.14159265358979323846264338327950288419716939937510582097494459 - -// TODO uiBool? 
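
The _UI_ENUM macro above gives every libui enum a fixed, ABI-stable representation: the named type is a plain unsigned int and the enumerators are anonymous constants. A minimal sketch of what one expansion behaves like (the uiExample names are illustrative, not part of libui):

    #include <type_traits>

    // _UI_ENUM(uiExample) expands to: typedef unsigned int uiExample; enum { ... };
    typedef unsigned int uiExample;
    enum
    {
        uiExampleFirst,   // anonymous enumerators: plain integer constants
        uiExampleSecond,
    };

    // the named type is always exactly unsigned int, independent of the
    // compiler's usual enum-sizing rules
    static_assert(std::is_same<uiExample, unsigned int>::value, "ABI-stable enum");
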
- -typedef struct uiInitOptions uiInitOptions; - -struct uiInitOptions { - size_t Size; -}; - -_UI_EXTERN const char *uiInit(uiInitOptions *options); -_UI_EXTERN void uiUninit(void); -_UI_EXTERN void uiFreeInitError(const char *err); - -_UI_EXTERN void uiMain(void); -_UI_EXTERN void uiMainSteps(void); -_UI_EXTERN int uiMainStep(int wait); -_UI_EXTERN void uiQuit(void); - -_UI_EXTERN void uiQueueMain(void (*f)(void *data), void *data); - -_UI_EXTERN void uiOnShouldQuit(int (*f)(void *data), void *data); - -_UI_EXTERN void uiFreeText(char *text); - -typedef struct uiControl uiControl; - -struct uiControl { - uint32_t Signature; - uint32_t OSSignature; - uint32_t TypeSignature; - void (*Destroy)(uiControl *); - uintptr_t (*Handle)(uiControl *); - uiControl *(*Parent)(uiControl *); - void (*SetParent)(uiControl *, uiControl *); - int (*Toplevel)(uiControl *); - int (*Visible)(uiControl *); - void (*Show)(uiControl *); - void (*Hide)(uiControl *); - int (*Enabled)(uiControl *); - void (*Enable)(uiControl *); - void (*Disable)(uiControl *); - void (*SetFocus)(uiControl *); - void (*SetMinSize)(uiControl*, int, int); - - int MinWidth, MinHeight; - - void* UserData; -}; -// TOOD add argument names to all arguments -#define uiControl(this) ((uiControl *) (this)) -_UI_EXTERN void uiControlDestroy(uiControl *); -_UI_EXTERN uintptr_t uiControlHandle(uiControl *); -_UI_EXTERN uiControl *uiControlParent(uiControl *); -_UI_EXTERN void uiControlSetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlToplevel(uiControl *); -_UI_EXTERN int uiControlVisible(uiControl *); -_UI_EXTERN void uiControlShow(uiControl *); -_UI_EXTERN void uiControlHide(uiControl *); -_UI_EXTERN int uiControlEnabled(uiControl *); -_UI_EXTERN void uiControlEnable(uiControl *); -_UI_EXTERN void uiControlDisable(uiControl *); -_UI_EXTERN void uiControlSetFocus(uiControl *); -_UI_EXTERN void uiControlSetMinSize(uiControl *, int w, int h); // -1 = no minimum - -_UI_EXTERN uiControl *uiAllocControl(size_t n, uint32_t OSsig, uint32_t typesig, const char *typenamestr); -_UI_EXTERN void uiFreeControl(uiControl *); - -// TODO make sure all controls have these -_UI_EXTERN void uiControlVerifySetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlEnabledToUser(uiControl *); - -_UI_EXTERN void uiUserBugCannotSetParentOnToplevel(const char *type); - -typedef struct uiWindow uiWindow; -#define uiWindow(this) ((uiWindow *) (this)) -_UI_EXTERN char *uiWindowTitle(uiWindow *w); -_UI_EXTERN void uiWindowSetTitle(uiWindow *w, const char *title); -_UI_EXTERN void uiWindowPosition(uiWindow *w, int *x, int *y); -_UI_EXTERN void uiWindowSetPosition(uiWindow *w, int x, int y); -_UI_EXTERN void uiWindowContentSize(uiWindow *w, int *width, int *height); -_UI_EXTERN void uiWindowSetContentSize(uiWindow *w, int width, int height); -_UI_EXTERN int uiWindowMinimized(uiWindow *w); -_UI_EXTERN void uiWindowSetMinimized(uiWindow *w, int minimized); -_UI_EXTERN int uiWindowMaximized(uiWindow *w); -_UI_EXTERN void uiWindowSetMaximized(uiWindow *w, int maximized); -_UI_EXTERN int uiWindowFullscreen(uiWindow *w); -_UI_EXTERN void uiWindowSetFullscreen(uiWindow *w, int fullscreen); -_UI_EXTERN int uiWindowBorderless(uiWindow *w); -_UI_EXTERN void uiWindowSetBorderless(uiWindow *w, int borderless); -_UI_EXTERN void uiWindowSetChild(uiWindow *w, uiControl *child); -_UI_EXTERN int uiWindowMargined(uiWindow *w); -_UI_EXTERN void uiWindowSetMargined(uiWindow *w, int margined); -_UI_EXTERN void uiWindowSetDropTarget(uiWindow* w, int drop); -_UI_EXTERN uiWindow 
*uiNewWindow(const char *title, int width, int height, int maximized, int hasMenubar, int resizable); - -_UI_EXTERN void uiWindowOnContentSizeChanged(uiWindow *w, void (*f)(uiWindow *, void *), void *data); -_UI_EXTERN void uiWindowOnClosing(uiWindow *w, int (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnDropFile(uiWindow *w, void (*f)(uiWindow *w, char *file, void *data), void *data); -_UI_EXTERN void uiWindowOnGetFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnLoseFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); - -typedef struct uiButton uiButton; -#define uiButton(this) ((uiButton *) (this)) -_UI_EXTERN char *uiButtonText(uiButton *b); -_UI_EXTERN void uiButtonSetText(uiButton *b, const char *text); -_UI_EXTERN void uiButtonOnClicked(uiButton *b, void (*f)(uiButton *b, void *data), void *data); -_UI_EXTERN uiButton *uiNewButton(const char *text); - -typedef struct uiBox uiBox; -#define uiBox(this) ((uiBox *) (this)) -_UI_EXTERN void uiBoxAppend(uiBox *b, uiControl *child, int stretchy); -_UI_EXTERN void uiBoxDelete(uiBox *b, int index); -_UI_EXTERN int uiBoxPadded(uiBox *b); -_UI_EXTERN void uiBoxSetPadded(uiBox *b, int padded); -_UI_EXTERN uiBox *uiNewHorizontalBox(void); -_UI_EXTERN uiBox *uiNewVerticalBox(void); - -typedef struct uiCheckbox uiCheckbox; -#define uiCheckbox(this) ((uiCheckbox *) (this)) -_UI_EXTERN char *uiCheckboxText(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetText(uiCheckbox *c, const char *text); -_UI_EXTERN void uiCheckboxOnToggled(uiCheckbox *c, void (*f)(uiCheckbox *c, void *data), void *data); -_UI_EXTERN int uiCheckboxChecked(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetChecked(uiCheckbox *c, int checked); -_UI_EXTERN uiCheckbox *uiNewCheckbox(const char *text); - -typedef struct uiEntry uiEntry; -#define uiEntry(this) ((uiEntry *) (this)) -_UI_EXTERN char *uiEntryText(uiEntry *e); -_UI_EXTERN void uiEntrySetText(uiEntry *e, const char *text); -_UI_EXTERN void uiEntryOnChanged(uiEntry *e, void (*f)(uiEntry *e, void *data), void *data); -_UI_EXTERN int uiEntryReadOnly(uiEntry *e); -_UI_EXTERN void uiEntrySetReadOnly(uiEntry *e, int readonly); -_UI_EXTERN uiEntry *uiNewEntry(void); -_UI_EXTERN uiEntry *uiNewPasswordEntry(void); -_UI_EXTERN uiEntry *uiNewSearchEntry(void); - -typedef struct uiLabel uiLabel; -#define uiLabel(this) ((uiLabel *) (this)) -_UI_EXTERN char *uiLabelText(uiLabel *l); -_UI_EXTERN void uiLabelSetText(uiLabel *l, const char *text); -_UI_EXTERN uiLabel *uiNewLabel(const char *text); - -typedef struct uiTab uiTab; -#define uiTab(this) ((uiTab *) (this)) -_UI_EXTERN void uiTabAppend(uiTab *t, const char *name, uiControl *c); -_UI_EXTERN void uiTabInsertAt(uiTab *t, const char *name, int before, uiControl *c); -_UI_EXTERN void uiTabDelete(uiTab *t, int index); -_UI_EXTERN int uiTabNumPages(uiTab *t); -_UI_EXTERN int uiTabMargined(uiTab *t, int page); -_UI_EXTERN void uiTabSetMargined(uiTab *t, int page, int margined); -_UI_EXTERN uiTab *uiNewTab(void); - -typedef struct uiGroup uiGroup; -#define uiGroup(this) ((uiGroup *) (this)) -_UI_EXTERN char *uiGroupTitle(uiGroup *g); -_UI_EXTERN void uiGroupSetTitle(uiGroup *g, const char *title); -_UI_EXTERN void uiGroupSetChild(uiGroup *g, uiControl *c); -_UI_EXTERN int uiGroupMargined(uiGroup *g); -_UI_EXTERN void uiGroupSetMargined(uiGroup *g, int margined); -_UI_EXTERN uiGroup *uiNewGroup(const char *title); - -// spinbox/slider rules: -// setting value outside of range will automatically clamp -// initial 
value is minimum -// complaint if min >= max? - -typedef struct uiSpinbox uiSpinbox; -#define uiSpinbox(this) ((uiSpinbox *) (this)) -_UI_EXTERN int uiSpinboxValue(uiSpinbox *s); -_UI_EXTERN void uiSpinboxSetValue(uiSpinbox *s, int value); -_UI_EXTERN void uiSpinboxOnChanged(uiSpinbox *s, void (*f)(uiSpinbox *s, void *data), void *data); -_UI_EXTERN uiSpinbox *uiNewSpinbox(int min, int max); - -typedef struct uiSlider uiSlider; -#define uiSlider(this) ((uiSlider *) (this)) -_UI_EXTERN int uiSliderValue(uiSlider *s); -_UI_EXTERN void uiSliderSetValue(uiSlider *s, int value); -_UI_EXTERN void uiSliderOnChanged(uiSlider *s, void (*f)(uiSlider *s, void *data), void *data); -_UI_EXTERN uiSlider *uiNewSlider(int min, int max); - -typedef struct uiProgressBar uiProgressBar; -#define uiProgressBar(this) ((uiProgressBar *) (this)) -_UI_EXTERN int uiProgressBarValue(uiProgressBar *p); -_UI_EXTERN void uiProgressBarSetValue(uiProgressBar *p, int n); -_UI_EXTERN uiProgressBar *uiNewProgressBar(void); - -typedef struct uiSeparator uiSeparator; -#define uiSeparator(this) ((uiSeparator *) (this)) -_UI_EXTERN uiSeparator *uiNewHorizontalSeparator(void); -_UI_EXTERN uiSeparator *uiNewVerticalSeparator(void); - -typedef struct uiCombobox uiCombobox; -#define uiCombobox(this) ((uiCombobox *) (this)) -_UI_EXTERN void uiComboboxAppend(uiCombobox *c, const char *text); -_UI_EXTERN int uiComboboxSelected(uiCombobox *c); -_UI_EXTERN void uiComboboxSetSelected(uiCombobox *c, int n); -_UI_EXTERN void uiComboboxOnSelected(uiCombobox *c, void (*f)(uiCombobox *c, void *data), void *data); -_UI_EXTERN uiCombobox *uiNewCombobox(void); - -typedef struct uiEditableCombobox uiEditableCombobox; -#define uiEditableCombobox(this) ((uiEditableCombobox *) (this)) -_UI_EXTERN void uiEditableComboboxAppend(uiEditableCombobox *c, const char *text); -_UI_EXTERN char *uiEditableComboboxText(uiEditableCombobox *c); -_UI_EXTERN void uiEditableComboboxSetText(uiEditableCombobox *c, const char *text); -// TODO what do we call a function that sets the currently selected item and fills the text field with it? editable comboboxes have no consistent concept of selected item -_UI_EXTERN void uiEditableComboboxOnChanged(uiEditableCombobox *c, void (*f)(uiEditableCombobox *c, void *data), void *data); -_UI_EXTERN uiEditableCombobox *uiNewEditableCombobox(void); - -typedef struct uiRadioButtons uiRadioButtons; -#define uiRadioButtons(this) ((uiRadioButtons *) (this)) -_UI_EXTERN void uiRadioButtonsAppend(uiRadioButtons *r, const char *text); -_UI_EXTERN int uiRadioButtonsSelected(uiRadioButtons *r); -_UI_EXTERN void uiRadioButtonsSetSelected(uiRadioButtons *r, int n); -_UI_EXTERN void uiRadioButtonsOnSelected(uiRadioButtons *r, void (*f)(uiRadioButtons *, void *), void *data); -_UI_EXTERN uiRadioButtons *uiNewRadioButtons(void); - -typedef struct uiDateTimePicker uiDateTimePicker; -#define uiDateTimePicker(this) ((uiDateTimePicker *) (this)) -_UI_EXTERN uiDateTimePicker *uiNewDateTimePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewDatePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewTimePicker(void); - -// TODO provide a facility for entering tab stops? 
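
As the spinbox/slider rules above say, out-of-range values clamp automatically and the initial value is the minimum. A small sketch against the declarations just listed (assumes uiInit has already run; the callback body is illustrative):

    #include <stdio.h>
    #include "libui/ui.h"

    static void OnBlockSizeChanged(uiSpinbox* s, void* blarg)
    {
        printf("max block size: %d\n", uiSpinboxValue(s));
    }

    static void AddBlockSizeSpinbox(uiBox* parent)
    {
        uiSpinbox* s = uiNewSpinbox(1, 32);  // initial value is the minimum: 1
        uiSpinboxSetValue(s, 100);           // outside the range: clamps to 32
        uiSpinboxOnChanged(s, OnBlockSizeChanged, NULL);
        uiBoxAppend(parent, uiControl(s), 0);
    }
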
-typedef struct uiMultilineEntry uiMultilineEntry; -#define uiMultilineEntry(this) ((uiMultilineEntry *) (this)) -_UI_EXTERN char *uiMultilineEntryText(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetText(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryAppend(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryOnChanged(uiMultilineEntry *e, void (*f)(uiMultilineEntry *e, void *data), void *data); -_UI_EXTERN int uiMultilineEntryReadOnly(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetReadOnly(uiMultilineEntry *e, int readonly); -_UI_EXTERN uiMultilineEntry *uiNewMultilineEntry(void); -_UI_EXTERN uiMultilineEntry *uiNewNonWrappingMultilineEntry(void); - -typedef struct uiMenuItem uiMenuItem; -#define uiMenuItem(this) ((uiMenuItem *) (this)) -_UI_EXTERN void uiMenuItemEnable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemDisable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemOnClicked(uiMenuItem *m, void (*f)(uiMenuItem *sender, uiWindow *window, void *data), void *data); -_UI_EXTERN int uiMenuItemChecked(uiMenuItem *m); -_UI_EXTERN void uiMenuItemSetChecked(uiMenuItem *m, int checked); - -typedef struct uiMenu uiMenu; -#define uiMenu(this) ((uiMenu *) (this)) -_UI_EXTERN uiMenuItem *uiMenuAppendItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendCheckItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendQuitItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendPreferencesItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendAboutItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendSubmenu(uiMenu *m, uiMenu* child); -_UI_EXTERN void uiMenuAppendSeparator(uiMenu *m); -_UI_EXTERN uiMenu *uiNewMenu(const char *name); - -_UI_EXTERN char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN void uiMsgBox(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN void uiMsgBoxError(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description); - -typedef struct uiArea uiArea; -typedef struct uiAreaHandler uiAreaHandler; -typedef struct uiAreaDrawParams uiAreaDrawParams; -typedef struct uiAreaMouseEvent uiAreaMouseEvent; -typedef struct uiAreaKeyEvent uiAreaKeyEvent; - -typedef struct uiDrawContext uiDrawContext; - -// TO CONSIDER: the uiAreaHandler param there seems useless -// (might use individual callbacks instead of handler struct?) -struct uiAreaHandler { - void (*Draw)(uiAreaHandler *, uiArea *, uiAreaDrawParams *); - // TODO document that resizes cause a full redraw for non-scrolling areas; implementation-defined for scrolling areas - void (*MouseEvent)(uiAreaHandler *, uiArea *, uiAreaMouseEvent *); - // TODO document that on first show if the mouse is already in the uiArea then one gets sent with left=0 - // TODO what about when the area is hidden and then shown again? - void (*MouseCrossed)(uiAreaHandler *, uiArea *, int left); - void (*DragBroken)(uiAreaHandler *, uiArea *); - int (*KeyEvent)(uiAreaHandler *, uiArea *, uiAreaKeyEvent *); - void (*Resize)(uiAreaHandler *, uiArea *, int, int); -}; - -// TODO RTL layouts? 
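
The uiAreaHandler struct above is a table of plain function pointers, filled in member order. A do-nothing handler as a sketch (melonDS's real one, MainDrawAreaHandler, appears in the deleted main.cpp further below); the int return of KeyEvent is taken here to mean handled/not handled:

    #include "libui/ui.h"

    static void OnDraw(uiAreaHandler* h, uiArea* a, uiAreaDrawParams* p) { /* repaint */ }
    static void OnMouseEvent(uiAreaHandler* h, uiArea* a, uiAreaMouseEvent* e) {}
    static void OnMouseCrossed(uiAreaHandler* h, uiArea* a, int left) {}
    static void OnDragBroken(uiAreaHandler* h, uiArea* a) {}
    static int OnKeyEvent(uiAreaHandler* h, uiArea* a, uiAreaKeyEvent* e) { return 0; }
    static void OnResize(uiAreaHandler* h, uiArea* a, int w, int ht) {}

    // member order matches the declaration: Draw, MouseEvent, MouseCrossed,
    // DragBroken, KeyEvent, Resize
    static uiAreaHandler AreaHandler =
        { OnDraw, OnMouseEvent, OnMouseCrossed, OnDragBroken, OnKeyEvent, OnResize };

    uiArea* MakeArea() { return uiNewArea(&AreaHandler); }
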
-// TODO reconcile edge and corner naming -_UI_ENUM(uiWindowResizeEdge) { - uiWindowResizeEdgeLeft, - uiWindowResizeEdgeTop, - uiWindowResizeEdgeRight, - uiWindowResizeEdgeBottom, - uiWindowResizeEdgeTopLeft, - uiWindowResizeEdgeTopRight, - uiWindowResizeEdgeBottomLeft, - uiWindowResizeEdgeBottomRight, - // TODO have one for keyboard resizes? - // TODO GDK doesn't seem to have any others, including for keyboards... - // TODO way to bring up the system menu instead? -}; - -#define uiGLVersion(major, minor) ((major) | ((minor)<<16)) -#define uiGLVerMajor(ver) ((ver) & 0xFFFF) -#define uiGLVerMinor(ver) ((ver) >> 16) - -#define uiArea(this) ((uiArea *) (this)) -// TODO give a better name -// TODO document the types of width and height -_UI_EXTERN void uiAreaSetSize(uiArea *a, int width, int height); -// TODO uiAreaQueueRedraw() -_UI_EXTERN void uiAreaQueueRedrawAll(uiArea *a); -_UI_EXTERN void uiAreaScrollTo(uiArea *a, double x, double y, double width, double height); -// TODO document these can only be called within Mouse() handlers -// TODO should these be allowed on scrolling areas? -// TODO decide which mouse events should be accepted; Down is the only one guaranteed to work right now -// TODO what happens to events after calling this up to and including the next mouse up? -// TODO release capture? -_UI_EXTERN void uiAreaBeginUserWindowMove(uiArea *a); -_UI_EXTERN void uiAreaBeginUserWindowResize(uiArea *a, uiWindowResizeEdge edge); -_UI_EXTERN void uiAreaSetBackgroundColor(uiArea *a, int r, int g, int b); -_UI_EXTERN uiArea *uiNewArea(uiAreaHandler *ah); -_UI_EXTERN uiArea *uiNewGLArea(uiAreaHandler *ah, const unsigned int* req_versions); -_UI_EXTERN uiArea *uiNewScrollingArea(uiAreaHandler *ah, int width, int height); - -struct uiAreaDrawParams { - uiDrawContext *Context; - - // TODO document that this is only defined for nonscrolling areas - double AreaWidth; - double AreaHeight; - - double ClipX; - double ClipY; - double ClipWidth; - double ClipHeight; -}; - -typedef struct uiDrawPath uiDrawPath; -typedef struct uiDrawBrush uiDrawBrush; -typedef struct uiDrawStrokeParams uiDrawStrokeParams; -typedef struct uiDrawMatrix uiDrawMatrix; - -typedef struct uiDrawBrushGradientStop uiDrawBrushGradientStop; - -typedef struct uiDrawBitmap uiDrawBitmap; - -_UI_ENUM(uiDrawBrushType) { - uiDrawBrushTypeSolid, - uiDrawBrushTypeLinearGradient, - uiDrawBrushTypeRadialGradient, - uiDrawBrushTypeImage, -}; - -_UI_ENUM(uiDrawLineCap) { - uiDrawLineCapFlat, - uiDrawLineCapRound, - uiDrawLineCapSquare, -}; - -_UI_ENUM(uiDrawLineJoin) { - uiDrawLineJoinMiter, - uiDrawLineJoinRound, - uiDrawLineJoinBevel, -}; - -// this is the default for botoh cairo and Direct2D (in the latter case, from the C++ helper functions) -// Core Graphics doesn't explicitly specify a default, but NSBezierPath allows you to choose one, and this is the initial value -// so we're good to use it too! 
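
The uiGLVersion macros above pack a major/minor pair into one integer: low 16 bits major, high 16 bits minor. A quick round-trip check (macros copied from the header so the snippet is self-contained):

    #include <cassert>

    #define uiGLVersion(major, minor) ((major) | ((minor)<<16))
    #define uiGLVerMajor(ver) ((ver) & 0xFFFF)
    #define uiGLVerMinor(ver) ((ver) >> 16)

    int main()
    {
        unsigned int ver = uiGLVersion(3, 2);  // 3 | (2 << 16) == 0x00020003
        assert(uiGLVerMajor(ver) == 3);
        assert(uiGLVerMinor(ver) == 2);
        // the frontend's kGLVersions below, {uiGLVersion(3,2), uiGLVersion(3,1), 0},
        // is a zero-terminated preference list handed to uiNewGLArea()
        return 0;
    }
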
-#define uiDrawDefaultMiterLimit 10.0 - -_UI_ENUM(uiDrawFillMode) { - uiDrawFillModeWinding, - uiDrawFillModeAlternate, -}; - -struct uiDrawMatrix { - double M11; - double M12; - double M21; - double M22; - double M31; - double M32; -}; - -struct uiDrawBrush { - uiDrawBrushType Type; - - // solid brushes - double R; - double G; - double B; - double A; - - // gradient brushes - double X0; // linear: start X, radial: start X - double Y0; // linear: start Y, radial: start Y - double X1; // linear: end X, radial: outer circle center X - double Y1; // linear: end Y, radial: outer circle center Y - double OuterRadius; // radial gradients only - uiDrawBrushGradientStop *Stops; - size_t NumStops; - // TODO extend mode - // cairo: none, repeat, reflect, pad; no individual control - // Direct2D: repeat, reflect, pad; no individual control - // Core Graphics: none, pad; before and after individually - // TODO cairo documentation is inconsistent about pad - - // TODO images - - // TODO transforms -}; - -struct uiDrawBrushGradientStop { - double Pos; - double R; - double G; - double B; - double A; -}; - -struct uiDrawStrokeParams { - uiDrawLineCap Cap; - uiDrawLineJoin Join; - // TODO what if this is 0? on windows there will be a crash with dashing - double Thickness; - double MiterLimit; - double *Dashes; - // TOOD what if this is 1 on Direct2D? - // TODO what if a dash is 0 on Cairo or Quartz? - size_t NumDashes; - double DashPhase; -}; - -struct uiRect { - int X; - int Y; - int Width; - int Height; -}; - -typedef struct uiRect uiRect; - -_UI_EXTERN uiDrawPath *uiDrawNewPath(uiDrawFillMode fillMode); -_UI_EXTERN void uiDrawFreePath(uiDrawPath *p); - -_UI_EXTERN void uiDrawPathNewFigure(uiDrawPath *p, double x, double y); -_UI_EXTERN void uiDrawPathNewFigureWithArc(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathLineTo(uiDrawPath *p, double x, double y); -// notes: angles are both relative to 0 and go counterclockwise -// TODO is the initial line segment on cairo and OS X a proper join? -// TODO what if sweep < 0? 
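
A sketch of the path lifecycle declared above, as it would run inside an area's Draw callback: build figures, end the path, then fill or stroke it through the draw context (assumes the end-before-use rule that the separate uiDrawPathEnd step implies):

    #include "libui/ui.h"

    static void DrawTriangle(uiAreaDrawParams* p)
    {
        uiDrawPath* path = uiDrawNewPath(uiDrawFillModeWinding);
        uiDrawPathNewFigure(path, 10.0, 10.0);
        uiDrawPathLineTo(path, 110.0, 10.0);
        uiDrawPathLineTo(path, 60.0, 90.0);
        uiDrawPathCloseFigure(path);
        uiDrawPathEnd(path);               // finish building before drawing

        uiDrawBrush brush = {};
        brush.Type = uiDrawBrushTypeSolid; // a solid fill reads only R/G/B/A
        brush.R = 1.0; brush.A = 1.0;
        uiDrawFill(p->Context, path, &brush);
        uiDrawFreePath(path);
    }
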
-_UI_EXTERN void uiDrawPathArcTo(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathBezierTo(uiDrawPath *p, double c1x, double c1y, double c2x, double c2y, double endX, double endY); -// TODO quadratic bezier -_UI_EXTERN void uiDrawPathCloseFigure(uiDrawPath *p); - -// TODO effect of these when a figure is already started -_UI_EXTERN void uiDrawPathAddRectangle(uiDrawPath *p, double x, double y, double width, double height); - -_UI_EXTERN void uiDrawPathEnd(uiDrawPath *p); - -_UI_EXTERN void uiDrawStroke(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b, uiDrawStrokeParams *p); -_UI_EXTERN void uiDrawFill(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b); - -// TODO primitives: -// - rounded rectangles -// - elliptical arcs -// - quadratic bezier curves - -_UI_EXTERN void uiDrawMatrixSetIdentity(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTranslate(uiDrawMatrix *m, double x, double y); -_UI_EXTERN void uiDrawMatrixScale(uiDrawMatrix *m, double xCenter, double yCenter, double x, double y); -_UI_EXTERN void uiDrawMatrixRotate(uiDrawMatrix *m, double x, double y, double amount); -_UI_EXTERN void uiDrawMatrixSkew(uiDrawMatrix *m, double x, double y, double xamount, double yamount); -_UI_EXTERN void uiDrawMatrixMultiply(uiDrawMatrix *dest, uiDrawMatrix *src); -_UI_EXTERN int uiDrawMatrixInvertible(uiDrawMatrix *m); -_UI_EXTERN int uiDrawMatrixInvert(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTransformPoint(uiDrawMatrix *m, double *x, double *y); -_UI_EXTERN void uiDrawMatrixTransformSize(uiDrawMatrix *m, double *x, double *y); - -_UI_EXTERN void uiDrawTransform(uiDrawContext *c, uiDrawMatrix *m); - -// TODO add a uiDrawPathStrokeToFill() or something like that -_UI_EXTERN void uiDrawClip(uiDrawContext *c, uiDrawPath *path); - -_UI_EXTERN void uiDrawSave(uiDrawContext *c); -_UI_EXTERN void uiDrawRestore(uiDrawContext *c); - -// bitmap API -_UI_EXTERN uiDrawBitmap* uiDrawNewBitmap(uiDrawContext* c, int width, int height, int alpha); -_UI_EXTERN void uiDrawBitmapUpdate(uiDrawBitmap* bmp, const void* data); -_UI_EXTERN void uiDrawBitmapDraw(uiDrawContext* c, uiDrawBitmap* bmp, uiRect* srcrect, uiRect* dstrect, int filter); -_UI_EXTERN void uiDrawFreeBitmap(uiDrawBitmap* bmp); - -// TODO manage the use of Text, Font, and TextFont, and of the uiDrawText prefix in general - -///// TODO reconsider this -typedef struct uiDrawFontFamilies uiDrawFontFamilies; - -_UI_EXTERN uiDrawFontFamilies *uiDrawListFontFamilies(void); -_UI_EXTERN int uiDrawFontFamiliesNumFamilies(uiDrawFontFamilies *ff); -_UI_EXTERN char *uiDrawFontFamiliesFamily(uiDrawFontFamilies *ff, int n); -_UI_EXTERN void uiDrawFreeFontFamilies(uiDrawFontFamilies *ff); -///// END TODO - -typedef struct uiDrawTextLayout uiDrawTextLayout; -typedef struct uiDrawTextFont uiDrawTextFont; -typedef struct uiDrawTextFontDescriptor uiDrawTextFontDescriptor; -typedef struct uiDrawTextFontMetrics uiDrawTextFontMetrics; - -_UI_ENUM(uiDrawTextWeight) { - uiDrawTextWeightThin, - uiDrawTextWeightUltraLight, - uiDrawTextWeightLight, - uiDrawTextWeightBook, - uiDrawTextWeightNormal, - uiDrawTextWeightMedium, - uiDrawTextWeightSemiBold, - uiDrawTextWeightBold, - uiDrawTextWeightUltraBold, - uiDrawTextWeightHeavy, - uiDrawTextWeightUltraHeavy, -}; - -_UI_ENUM(uiDrawTextItalic) { - uiDrawTextItalicNormal, - uiDrawTextItalicOblique, - uiDrawTextItalicItalic, -}; - -_UI_ENUM(uiDrawTextStretch) { - uiDrawTextStretchUltraCondensed, - uiDrawTextStretchExtraCondensed, - 
uiDrawTextStretchCondensed, - uiDrawTextStretchSemiCondensed, - uiDrawTextStretchNormal, - uiDrawTextStretchSemiExpanded, - uiDrawTextStretchExpanded, - uiDrawTextStretchExtraExpanded, - uiDrawTextStretchUltraExpanded, -}; - -struct uiDrawTextFontDescriptor { - const char *Family; - double Size; - uiDrawTextWeight Weight; - uiDrawTextItalic Italic; - uiDrawTextStretch Stretch; -}; - -struct uiDrawTextFontMetrics { - double Ascent; - double Descent; - double Leading; - // TODO do these two mean the same across all platforms? - double UnderlinePos; - double UnderlineThickness; -}; - -_UI_EXTERN uiDrawTextFont *uiDrawLoadClosestFont(const uiDrawTextFontDescriptor *desc); -_UI_EXTERN void uiDrawFreeTextFont(uiDrawTextFont *font); -_UI_EXTERN uintptr_t uiDrawTextFontHandle(uiDrawTextFont *font); -_UI_EXTERN void uiDrawTextFontDescribe(uiDrawTextFont *font, uiDrawTextFontDescriptor *desc); -// TODO make copy with given attributes methods? -// TODO yuck this name -_UI_EXTERN void uiDrawTextFontGetMetrics(uiDrawTextFont *font, uiDrawTextFontMetrics *metrics); - -// TODO initial line spacing? and what about leading? -_UI_EXTERN uiDrawTextLayout *uiDrawNewTextLayout(const char *text, uiDrawTextFont *defaultFont, double width); -_UI_EXTERN void uiDrawFreeTextLayout(uiDrawTextLayout *layout); -// TODO get width -_UI_EXTERN void uiDrawTextLayoutSetWidth(uiDrawTextLayout *layout, double width); -_UI_EXTERN void uiDrawTextLayoutExtents(uiDrawTextLayout *layout, double *width, double *height); - -// and the attributes that you can set on a text layout -_UI_EXTERN void uiDrawTextLayoutSetColor(uiDrawTextLayout *layout, int startChar, int endChar, double r, double g, double b, double a); - -_UI_EXTERN void uiDrawText(uiDrawContext *c, double x, double y, uiDrawTextLayout *layout); - - -// OpenGL support - -typedef struct uiGLContext uiGLContext; - -_UI_EXTERN uiGLContext *uiAreaGetGLContext(uiArea* a); -_UI_EXTERN void uiGLMakeContextCurrent(uiGLContext* ctx); -_UI_EXTERN void uiGLBegin(uiGLContext* ctx); -_UI_EXTERN void uiGLEnd(uiGLContext* ctx); -_UI_EXTERN unsigned int uiGLGetVersion(uiGLContext* ctx); -_UI_EXTERN void *uiGLGetProcAddress(const char* proc); -_UI_EXTERN int uiGLGetFramebuffer(uiGLContext* ctx); -_UI_EXTERN float uiGLGetFramebufferScale(uiGLContext* ctx); -_UI_EXTERN void uiGLSwapBuffers(uiGLContext* ctx); -_UI_EXTERN void uiGLSetVSync(int sync); - - -_UI_ENUM(uiModifiers) { - uiModifierCtrl = 1 << 0, - uiModifierAlt = 1 << 1, - uiModifierShift = 1 << 2, - uiModifierSuper = 1 << 3, -}; - -// TODO document drag captures -struct uiAreaMouseEvent { - // TODO document what these mean for scrolling areas - double X; - double Y; - - // TODO see draw above - double AreaWidth; - double AreaHeight; - - int Down; - int Up; - - int Count; - - uiModifiers Modifiers; - - uint64_t Held1To64; -}; - -_UI_ENUM(uiExtKey) { - uiExtKeyEscape = 1, - uiExtKeyInsert, // equivalent to "Help" on Apple keyboards - uiExtKeyDelete, - uiExtKeyHome, - uiExtKeyEnd, - uiExtKeyPageUp, - uiExtKeyPageDown, - uiExtKeyUp, - uiExtKeyDown, - uiExtKeyLeft, - uiExtKeyRight, - uiExtKeyF1, // F1..F12 are guaranteed to be consecutive - uiExtKeyF2, - uiExtKeyF3, - uiExtKeyF4, - uiExtKeyF5, - uiExtKeyF6, - uiExtKeyF7, - uiExtKeyF8, - uiExtKeyF9, - uiExtKeyF10, - uiExtKeyF11, - uiExtKeyF12, - uiExtKeyN0, // numpad keys; independent of Num Lock state - uiExtKeyN1, // N0..N9 are guaranteed to be consecutive - uiExtKeyN2, - uiExtKeyN3, - uiExtKeyN4, - uiExtKeyN5, - uiExtKeyN6, - uiExtKeyN7, - uiExtKeyN8, - uiExtKeyN9, - uiExtKeyNDot, - 
uiExtKeyNEnter, - uiExtKeyNAdd, - uiExtKeyNSubtract, - uiExtKeyNMultiply, - uiExtKeyNDivide, -}; - -struct uiAreaKeyEvent { - char Key; - uiExtKey ExtKey; - uiModifiers Modifier; - - uiModifiers Modifiers; - - // additional things - int Scancode; // bit0-7: scancode, bit8: ext flag - - int Up; - int Repeat; -}; - -typedef struct uiFontButton uiFontButton; -#define uiFontButton(this) ((uiFontButton *) (this)) -// TODO document this returns a new font -_UI_EXTERN uiDrawTextFont *uiFontButtonFont(uiFontButton *b); -// TOOD SetFont, mechanics -_UI_EXTERN void uiFontButtonOnChanged(uiFontButton *b, void (*f)(uiFontButton *, void *), void *data); -_UI_EXTERN uiFontButton *uiNewFontButton(void); - -typedef struct uiColorButton uiColorButton; -#define uiColorButton(this) ((uiColorButton *) (this)) -_UI_EXTERN void uiColorButtonColor(uiColorButton *b, double *r, double *g, double *bl, double *a); -_UI_EXTERN void uiColorButtonSetColor(uiColorButton *b, double r, double g, double bl, double a); -_UI_EXTERN void uiColorButtonOnChanged(uiColorButton *b, void (*f)(uiColorButton *, void *), void *data); -_UI_EXTERN uiColorButton *uiNewColorButton(void); - -typedef struct uiForm uiForm; -#define uiForm(this) ((uiForm *) (this)) -_UI_EXTERN void uiFormAppend(uiForm *f, const char *label, uiControl *c, int stretchy); -_UI_EXTERN void uiFormDelete(uiForm *f, int index); -_UI_EXTERN int uiFormPadded(uiForm *f); -_UI_EXTERN void uiFormSetPadded(uiForm *f, int padded); -_UI_EXTERN uiForm *uiNewForm(void); - -_UI_ENUM(uiAlign) { - uiAlignFill, - uiAlignStart, - uiAlignCenter, - uiAlignEnd, -}; - -_UI_ENUM(uiAt) { - uiAtLeading, - uiAtTop, - uiAtTrailing, - uiAtBottom, -}; - -typedef struct uiGrid uiGrid; -#define uiGrid(this) ((uiGrid *) (this)) -_UI_EXTERN void uiGridAppend(uiGrid *g, uiControl *c, int left, int top, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN void uiGridInsertAt(uiGrid *g, uiControl *c, uiControl *existing, uiAt at, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN int uiGridPadded(uiGrid *g); -_UI_EXTERN void uiGridSetPadded(uiGrid *g, int padded); -_UI_EXTERN uiGrid *uiNewGrid(void); - - -// misc. 
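
uiGridAppend's long parameter list above is easier to read with the slots named: (control, left, top, xspan, yspan, hexpand, halign, vexpand, valign). A sketch placing a label next to a spinbox on one row (the layout values are illustrative):

    #include "libui/ui.h"

    static uiGrid* MakeBlockSizeRow()
    {
        uiGrid* g = uiNewGrid();
        uiGridSetPadded(g, 1);
        uiGridAppend(g, uiControl(uiNewLabel("Maximum block size:")),
                     0, 0, 1, 1, 0, uiAlignFill, 0, uiAlignFill);  // column 0, no expand
        uiGridAppend(g, uiControl(uiNewSpinbox(1, 32)),
                     1, 0, 1, 1, 1, uiAlignFill, 0, uiAlignFill);  // column 1, expands
        return g;
    }
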
- -_UI_EXTERN char* uiKeyName(int scancode); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/libui_sdl/libui/unix/stddialogs.c b/src/libui_sdl/libui/unix/stddialogs.c deleted file mode 100644 index 10c598d..0000000 --- a/src/libui_sdl/libui/unix/stddialogs.c +++ /dev/null @@ -1,126 +0,0 @@ -// 26 june 2015 -#include "uipriv_unix.h" - -// LONGTERM figure out why, and describe, that this is the desired behavior -// LONGTERM also point out that font and color buttons also work like this - -#define windowWindow(w) ((w)?(GTK_WINDOW(uiControlHandle(uiControl(w)))):NULL) - -static char *filedialog(GtkWindow *parent, GtkFileChooserAction mode, const gchar *confirm, const char* filter, const char* initpath) -{ - GtkWidget *fcd; - GtkFileChooser *fc; - gint response; - char *filename; - - fcd = gtk_file_chooser_dialog_new(NULL, parent, mode, - "_Cancel", GTK_RESPONSE_CANCEL, - confirm, GTK_RESPONSE_ACCEPT, - NULL); - fc = GTK_FILE_CHOOSER(fcd); - - // filters - { - gchar _filter[256]; - gchar* fp = &_filter[0]; int s = 0; - gchar* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - GtkFileFilter* filter = gtk_file_filter_new(); - gtk_file_filter_set_name(filter, fname); - - for (gchar* j = fp; ; j++) - { - if (*j == ';') - { - *j = '\0'; - gtk_file_filter_add_pattern(filter, fp); - fp = j+1; - } - else if (*j == '\0') - { - gtk_file_filter_add_pattern(filter, fp); - break; - } - } - - gtk_file_chooser_add_filter(fc, filter); - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - } - - gtk_file_chooser_set_local_only(fc, FALSE); - gtk_file_chooser_set_select_multiple(fc, FALSE); - gtk_file_chooser_set_show_hidden(fc, TRUE); - gtk_file_chooser_set_do_overwrite_confirmation(fc, TRUE); - gtk_file_chooser_set_create_folders(fc, TRUE); - if (initpath && strlen(initpath)>0) - gtk_file_chooser_set_current_folder(fc, initpath); - - response = gtk_dialog_run(GTK_DIALOG(fcd)); - if (response != GTK_RESPONSE_ACCEPT) { - gtk_widget_destroy(fcd); - return NULL; - } - filename = uiUnixStrdupText(gtk_file_chooser_get_filename(fc)); - gtk_widget_destroy(fcd); - return filename; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_OPEN, "_Open", filter, initpath); -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_SAVE, "_Save", filter, initpath); -} - -static int msgbox(GtkWindow *parent, const char *title, const char *description, GtkMessageType type, GtkButtonsType buttons) -{ - GtkWidget *md; - - md = gtk_message_dialog_new(parent, GTK_DIALOG_MODAL, - type, buttons, - "%s", title); - gtk_message_dialog_format_secondary_text(GTK_MESSAGE_DIALOG(md), "%s", description); - int result = gtk_dialog_run(GTK_DIALOG(md)); - gtk_widget_destroy(md); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_OTHER, GTK_BUTTONS_OK); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_ERROR, GTK_BUTTONS_OK); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - int result = - 
msgbox(windowWindow(parent), title, description, GTK_MESSAGE_QUESTION, GTK_BUTTONS_OK_CANCEL); - - return result == GTK_RESPONSE_OK; -} \ No newline at end of file diff --git a/src/libui_sdl/libui/windows/stddialogs.cpp b/src/libui_sdl/libui/windows/stddialogs.cpp deleted file mode 100644 index 7537015..0000000 --- a/src/libui_sdl/libui/windows/stddialogs.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// 22 may 2015 -#include "uipriv_windows.hpp" - -// TODO document all this is what we want -// TODO do the same for font and color buttons - -// notes: -// - FOS_SUPPORTSTREAMABLEITEMS doesn't seem to be supported on windows vista, or at least not with the flags we use -// - even with FOS_NOVALIDATE the dialogs will reject invalid filenames (at least on Vista, anyway) -// - lack of FOS_NOREADONLYRETURN doesn't seem to matter on Windows 7 - -// TODO -// - http://blogs.msdn.com/b/wpfsdk/archive/2006/10/26/uncommon-dialogs--font-chooser-and-color-picker-dialogs.aspx -// - when a dialog is active, tab navigation in other windows stops working -// - when adding uiOpenFolder(), use IFileDialog as well - https://msdn.microsoft.com/en-us/library/windows/desktop/bb762115%28v=vs.85%29.aspx - -#define windowHWND(w) (w ? (HWND) uiControlHandle(uiControl(w)) : NULL) - -char *commonItemDialog(HWND parent, REFCLSID clsid, REFIID iid, const char* filter, const char* initpath, FILEOPENDIALOGOPTIONS optsadd) -{ - IFileDialog *d = NULL; - FILEOPENDIALOGOPTIONS opts; - IShellItem *result = NULL; - WCHAR *wname = NULL; - char *name = NULL; - HRESULT hr; - - hr = CoCreateInstance(clsid, - NULL, CLSCTX_INPROC_SERVER, - iid, (LPVOID *) (&d)); - if (hr != S_OK) { - logHRESULT(L"error creating common item dialog", hr); - // always return NULL on error - goto out; - } - hr = d->GetOptions(&opts); - if (hr != S_OK) { - logHRESULT(L"error getting current options", hr); - goto out; - } - opts |= optsadd; - // the other platforms don't check read-only; we won't either - opts &= ~FOS_NOREADONLYRETURN; - hr = d->SetOptions(opts); - if (hr != S_OK) { - logHRESULT(L"error setting options", hr); - goto out; - } - - // filters - { - COMDLG_FILTERSPEC filterspec[8]; - wchar_t _filter[256]; - wchar_t* fp = &_filter[0]; int s = 0; - wchar_t* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - filterspec[s>>1].pszName = fname; - filterspec[s>>1].pszSpec = fp; - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - d->SetFileTypes(s>>1, filterspec); - } - - hr = d->Show(parent); - if (hr == HRESULT_FROM_WIN32(ERROR_CANCELLED)) - // cancelled; return NULL like we have ready - goto out; - if (hr != S_OK) { - logHRESULT(L"error showing dialog", hr); - goto out; - } - hr = d->GetResult(&result); - if (hr != S_OK) { - logHRESULT(L"error getting dialog result", hr); - goto out; - } - hr = result->GetDisplayName(SIGDN_FILESYSPATH, &wname); - if (hr != S_OK) { - logHRESULT(L"error getting filename", hr); - goto out; - } - name = toUTF8(wname); - -out: - if (wname != NULL) - CoTaskMemFree(wname); - if (result != NULL) - result->Release(); - if (d != NULL) - d->Release(); - return name; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileOpenDialog, IID_IFileOpenDialog, - filter, initpath, - FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | 
FOS_NOVALIDATE | FOS_PATHMUSTEXIST | FOS_FILEMUSTEXIST | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileSaveDialog, IID_IFileSaveDialog, - filter, initpath, - FOS_OVERWRITEPROMPT | FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | FOS_NOVALIDATE | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -// TODO switch to TaskDialogIndirect()? - -static int msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) -{ - WCHAR *wtitle, *wdescription; - HRESULT hr; - - wtitle = toUTF16(title); - wdescription = toUTF16(description); - - int result; - hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, &result); - if (hr != S_OK) - logHRESULT(L"error showing task dialog", hr); - - uiFree(wdescription); - uiFree(wtitle); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, NULL); - enableAllWindowsExcept(parent); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, TD_ERROR_ICON); - enableAllWindowsExcept(parent); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - int result = - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON | TDCBF_CANCEL_BUTTON, TD_WARNING_ICON); - enableAllWindowsExcept(parent); - - return result == IDOK; -} \ No newline at end of file diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp deleted file mode 100644 index 0066668..0000000 --- a/src/libui_sdl/main.cpp +++ /dev/null @@ -1,3061 +0,0 @@ -/* - Copyright 2016-2020 Arisotura - - This file is part of melonDS. - - melonDS is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation, either version 3 of the License, or (at your option) - any later version. - - melonDS is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with melonDS. If not, see http://www.gnu.org/licenses/. 
-*/ - -#include -#include -#include -#include - -#ifndef __WIN32__ -#include -#endif - -#include -#include "libui/ui.h" - -#include "../OpenGLSupport.h" -#include "main_shaders.h" - -#include "../types.h" -#include "../version.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" -#include "DlgInputConfig.h" -#include "DlgVideoSettings.h" -#include "DlgAudioSettings.h" -#include "DlgWifiSettings.h" - -#include "../NDS.h" -#include "../GBACart.h" -#include "../GPU.h" -#include "../SPU.h" -#include "../Wifi.h" -#include "../Platform.h" -#include "../Config.h" -#include "../ARMJIT.h" - -#include "../Savestate.h" - -#include "OSD.h" - -#ifdef MELONCAP -#include "MelonCap.h" -#endif // MELONCAP - - -// savestate slot mapping -// 1-8: regular slots (quick access) -// '9': load/save arbitrary file -const int kSavestateNum[9] = {1, 2, 3, 4, 5, 6, 7, 8, 0}; - -const int kScreenSize[4] = {1, 2, 3, 4}; -const int kScreenRot[4] = {0, 1, 2, 3}; -const int kScreenGap[6] = {0, 1, 8, 64, 90, 128}; -const int kScreenLayout[3] = {0, 1, 2}; -const int kScreenSizing[4] = {0, 1, 2, 3}; - - -char* EmuDirectory; - - -uiWindow* MainWindow; -uiArea* MainDrawArea; -uiAreaHandler MainDrawAreaHandler; - -const u32 kGLVersions[] = {uiGLVersion(3,2), uiGLVersion(3,1), 0}; -uiGLContext* GLContext; - -int WindowWidth, WindowHeight; - -uiMenuItem* MenuItem_SaveState; -uiMenuItem* MenuItem_LoadState; -uiMenuItem* MenuItem_UndoStateLoad; - -uiMenuItem* MenuItem_SaveStateSlot[9]; -uiMenuItem* MenuItem_LoadStateSlot[9]; - -uiMenuItem* MenuItem_Pause; -uiMenuItem* MenuItem_Reset; -uiMenuItem* MenuItem_Stop; - -uiMenuItem* MenuItem_SavestateSRAMReloc; - -uiMenuItem* MenuItem_ScreenRot[4]; -uiMenuItem* MenuItem_ScreenGap[6]; -uiMenuItem* MenuItem_ScreenLayout[3]; -uiMenuItem* MenuItem_ScreenSizing[4]; - -uiMenuItem* MenuItem_ScreenFilter; -uiMenuItem* MenuItem_LimitFPS; -uiMenuItem* MenuItem_AudioSync; -uiMenuItem* MenuItem_ShowOSD; - -SDL_Thread* EmuThread; -int EmuRunning; -volatile int EmuStatus; - -bool RunningSomething; -char ROMPath[2][1024]; -char SRAMPath[2][1024]; -char PrevSRAMPath[2][1024]; // for savestate 'undo load' - -bool SavestateLoaded; - -bool Screen_UseGL; - -bool ScreenDrawInited = false; -uiDrawBitmap* ScreenBitmap[2] = {NULL,NULL}; - -GLuint GL_ScreenShader[3]; -GLuint GL_ScreenShaderAccel[3]; -GLuint GL_ScreenShaderOSD[3]; -struct -{ - float uScreenSize[2]; - u32 u3DScale; - u32 uFilterMode; - -} GL_ShaderConfig; -GLuint GL_ShaderConfigUBO; -GLuint GL_ScreenVertexArrayID, GL_ScreenVertexBufferID; -float GL_ScreenVertices[2 * 3*2 * 4]; // position/texcoord -GLuint GL_ScreenTexture; -bool GL_ScreenSizeDirty; - -int GL_3DScale; - -bool GL_VSyncStatus; - -int ScreenGap = 0; -int ScreenLayout = 0; -int ScreenSizing = 0; -int ScreenRotation = 0; - -int MainScreenPos[3]; -int AutoScreenSizing; - -uiRect TopScreenRect; -uiRect BottomScreenRect; -uiDrawMatrix TopScreenTrans; -uiDrawMatrix BottomScreenTrans; - -bool Touching = false; - -u32 KeyInputMask, JoyInputMask; -u32 KeyHotkeyMask, JoyHotkeyMask; -u32 HotkeyMask, LastHotkeyMask; -u32 HotkeyPress, HotkeyRelease; - -#define HotkeyDown(hk) (HotkeyMask & (1<<(hk))) -#define HotkeyPressed(hk) (HotkeyPress & (1<<(hk))) -#define HotkeyReleased(hk) (HotkeyRelease & (1<<(hk))) - -bool LidStatus; - -int JoystickID; -SDL_Joystick* Joystick; - -int AudioFreq; -float AudioSampleFrac; -SDL_AudioDeviceID AudioDevice, MicDevice; - -SDL_cond* AudioSync; -SDL_mutex* AudioSyncLock; - -u32 MicBufferLength = 2048; -s16 MicBuffer[2048]; -u32 MicBufferReadPos, 
MicBufferWritePos; - -u32 MicWavLength; -s16* MicWavBuffer; - -void SetupScreenRects(int width, int height); - -void TogglePause(void* blarg); -void Reset(void* blarg); - -void SetupSRAMPath(int slot); - -void SaveState(int slot); -void LoadState(int slot); -void UndoStateLoad(); -void GetSavestateName(int slot, char* filename, int len); - -void CreateMainWindow(bool opengl); -void DestroyMainWindow(); -void RecreateMainWindow(bool opengl); - - - -bool GLScreen_InitShader(GLuint* shader, const char* fs) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS, fs, shader, "ScreenShader")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindAttribLocation(shader[2], 1, "vTexcoord"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "ScreenTex"); - glUniform1i(uni_id, 0); - uni_id = glGetUniformLocation(shader[2], "_3DTex"); - glUniform1i(uni_id, 1); - - return true; -} - -bool GLScreen_InitOSDShader(GLuint* shader) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, shader, "ScreenShaderOSD")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "OSDTex"); - glUniform1i(uni_id, 0); - - return true; -} - -bool GLScreen_Init() -{ - GL_VSyncStatus = Config::ScreenVSync; - - // TODO: consider using epoxy? 
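// The two shader-init helpers above share one wiring pattern: bind attribute
// and fragment-output locations before linking, then attach the program's
// "uConfig" uniform block to binding point 16 (where GL_ShaderConfigUBO gets
// bound below) and point each sampler uniform at a fixed texture unit. A
// minimal sketch of that pattern in isolation, assuming a GL 3.2 core
// context with function pointers already loaded; WireScreenShader is a
// hypothetical name, not part of the original file:
void WireScreenShader(GLuint prog) // prog: an already-linked program object
{
    // route the program's uConfig block to binding point 16
    GLuint blk = glGetUniformBlockIndex(prog, "uConfig");
    if (blk != GL_INVALID_INDEX)
        glUniformBlockBinding(prog, blk, 16);

    // sampler uniforms are plain ints naming a texture unit
    glUseProgram(prog);
    GLint uni = glGetUniformLocation(prog, "ScreenTex");
    if (uni >= 0) glUniform1i(uni, 0);  // GL_TEXTURE0
    uni = glGetUniformLocation(prog, "_3DTex");
    if (uni >= 0) glUniform1i(uni, 1);  // GL_TEXTURE1
}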
- if (!OpenGL_Init()) - return false; - - const GLubyte* renderer = glGetString(GL_RENDERER); // get renderer string - const GLubyte* version = glGetString(GL_VERSION); // version as a string - printf("OpenGL: renderer: %s\n", renderer); - printf("OpenGL: version: %s\n", version); - - if (!GLScreen_InitShader(GL_ScreenShader, kScreenFS)) - return false; - if (!GLScreen_InitShader(GL_ScreenShaderAccel, kScreenFS_Accel)) - return false; - if (!GLScreen_InitOSDShader(GL_ScreenShaderOSD)) - return false; - - memset(&GL_ShaderConfig, 0, sizeof(GL_ShaderConfig)); - - glGenBuffers(1, &GL_ShaderConfigUBO); - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - glBufferData(GL_UNIFORM_BUFFER, sizeof(GL_ShaderConfig), &GL_ShaderConfig, GL_STATIC_DRAW); - glBindBufferBase(GL_UNIFORM_BUFFER, 16, GL_ShaderConfigUBO); - - glGenBuffers(1, &GL_ScreenVertexBufferID); - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferData(GL_ARRAY_BUFFER, sizeof(GL_ScreenVertices), NULL, GL_STATIC_DRAW); - - glGenVertexArrays(1, &GL_ScreenVertexArrayID); - glBindVertexArray(GL_ScreenVertexArrayID); - glEnableVertexAttribArray(0); // position - glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(0)); - glEnableVertexAttribArray(1); // texcoord - glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(2*4)); - - glGenTextures(1, &GL_ScreenTexture); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI, 256*3 + 1, 192*2, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, NULL); - - GL_ScreenSizeDirty = true; - - return true; -} - -void GLScreen_DeInit() -{ - glDeleteTextures(1, &GL_ScreenTexture); - - glDeleteVertexArrays(1, &GL_ScreenVertexArrayID); - glDeleteBuffers(1, &GL_ScreenVertexBufferID); - - OpenGL_DeleteShaderProgram(GL_ScreenShader); - OpenGL_DeleteShaderProgram(GL_ScreenShaderAccel); - OpenGL_DeleteShaderProgram(GL_ScreenShaderOSD); -} - -void GLScreen_DrawScreen() -{ - bool vsync = Config::ScreenVSync && !HotkeyDown(HK_FastForward); - if (vsync != GL_VSyncStatus) - { - GL_VSyncStatus = vsync; - uiGLSetVSync(vsync); - } - - float scale = uiGLGetFramebufferScale(GLContext); - - glBindFramebuffer(GL_FRAMEBUFFER, uiGLGetFramebuffer(GLContext)); - - if (GL_ScreenSizeDirty) - { - GL_ScreenSizeDirty = false; - - GL_ShaderConfig.uScreenSize[0] = WindowWidth; - GL_ShaderConfig.uScreenSize[1] = WindowHeight; - GL_ShaderConfig.u3DScale = GL_3DScale; - - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - void* unibuf = glMapBuffer(GL_UNIFORM_BUFFER, GL_WRITE_ONLY); - if (unibuf) memcpy(unibuf, &GL_ShaderConfig, sizeof(GL_ShaderConfig)); - glUnmapBuffer(GL_UNIFORM_BUFFER); - - float scwidth, scheight; - - float x0, y0, x1, y1; - float s0, s1, s2, s3; - float t0, t1, t2, t3; - -#define SETVERTEX(i, x, y, s, t) \ - GL_ScreenVertices[4*(i) + 0] = x; \ - GL_ScreenVertices[4*(i) + 1] = y; \ - GL_ScreenVertices[4*(i) + 2] = s; \ - GL_ScreenVertices[4*(i) + 3] = t; - - x0 = TopScreenRect.X; - y0 = TopScreenRect.Y; - x1 = TopScreenRect.X + TopScreenRect.Width; - y1 = TopScreenRect.Y + TopScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 0; - s1 = scwidth; t1 = 0; - s2 = 0; t2 = scheight; - s3 = 
scwidth; t3 = scheight; - break; - - case 1: - s0 = 0; t0 = scheight; - s1 = 0; t1 = 0; - s2 = scwidth; t2 = scheight; - s3 = scwidth; t3 = 0; - break; - - case 2: - s0 = scwidth; t0 = scheight; - s1 = 0; t1 = scheight; - s2 = scwidth; t2 = 0; - s3 = 0; t3 = 0; - break; - - case 3: - s0 = scwidth; t0 = 0; - s1 = scwidth; t1 = scheight; - s2 = 0; t2 = 0; - s3 = 0; t3 = scheight; - break; - } - - SETVERTEX(0, x0, y0, s0, t0); - SETVERTEX(1, x1, y1, s3, t3); - SETVERTEX(2, x1, y0, s1, t1); - SETVERTEX(3, x0, y0, s0, t0); - SETVERTEX(4, x0, y1, s2, t2); - SETVERTEX(5, x1, y1, s3, t3); - - x0 = BottomScreenRect.X; - y0 = BottomScreenRect.Y; - x1 = BottomScreenRect.X + BottomScreenRect.Width; - y1 = BottomScreenRect.Y + BottomScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 192; - s1 = scwidth; t1 = 192; - s2 = 0; t2 = 192+scheight; - s3 = scwidth; t3 = 192+scheight; - break; - - case 1: - s0 = 0; t0 = 192+scheight; - s1 = 0; t1 = 192; - s2 = scwidth; t2 = 192+scheight; - s3 = scwidth; t3 = 192; - break; - - case 2: - s0 = scwidth; t0 = 192+scheight; - s1 = 0; t1 = 192+scheight; - s2 = scwidth; t2 = 192; - s3 = 0; t3 = 192; - break; - - case 3: - s0 = scwidth; t0 = 192; - s1 = scwidth; t1 = 192+scheight; - s2 = 0; t2 = 192; - s3 = 0; t3 = 192+scheight; - break; - } - - SETVERTEX(6, x0, y0, s0, t0); - SETVERTEX(7, x1, y1, s3, t3); - SETVERTEX(8, x1, y0, s1, t1); - SETVERTEX(9, x0, y0, s0, t0); - SETVERTEX(10, x0, y1, s2, t2); - SETVERTEX(11, x1, y1, s3, t3); - -#undef SETVERTEX - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(GL_ScreenVertices), GL_ScreenVertices); - } - - glDisable(GL_DEPTH_TEST); - glDisable(GL_STENCIL_TEST); - glDisable(GL_BLEND); - glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - - glViewport(0, 0, WindowWidth*scale, WindowHeight*scale); - - if (GPU3D::Renderer == 0) - OpenGL_UseShaderProgram(GL_ScreenShader); - else - OpenGL_UseShaderProgram(GL_ScreenShaderAccel); - - glClearColor(0, 0, 0, 1); - glClear(GL_COLOR_BUFFER_BIT); - - if (RunningSomething) - { - int frontbuf = GPU::FrontBuffer; - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - - if (GPU::Framebuffer[frontbuf][0] && GPU::Framebuffer[frontbuf][1]) - { - if (GPU3D::Renderer == 0) - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - else - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - } - - glActiveTexture(GL_TEXTURE1); - if (GPU3D::Renderer != 0) - GPU3D::GLRenderer::SetupAccelFrame(); - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBindVertexArray(GL_ScreenVertexArrayID); - glDrawArrays(GL_TRIANGLES, 0, 4*3); - } - - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Update(true, NULL); - - glFlush(); - uiGLSwapBuffers(GLContext); -} - -void MicLoadWav(char* name) -{ - SDL_AudioSpec format; - memset(&format, 0, sizeof(SDL_AudioSpec)); - - if (MicWavBuffer) delete[] MicWavBuffer; - MicWavBuffer = NULL; - MicWavLength = 0; - - u8* buf; - u32 len; - if (!SDL_LoadWAV(name, &format, &buf, &len)) - return; - - const u64 dstfreq = 44100; 
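// The conversion loops below are fractional-step nearest-neighbour
// resamplers: step through the source at srclen/dstlen samples per output
// sample, carrying the remainder in a float accumulator. AudioCallback()
// further down uses the exact same scheme. The idea in isolation (sketch;
// src/dst are hypothetical caller-provided buffers):
void ResampleNearest(const s16* src, int srclen, s16* dst, int dstlen)
{
    float res_incr = srclen / (float)dstlen; // source samples per output sample
    float res_timer = 0;
    int res_pos = 0;

    for (int i = 0; i < dstlen; i++)
    {
        dst[i] = src[res_pos];        // nearest (floor) source sample
        res_timer += res_incr;
        while (res_timer >= 1.0f)     // carry the fraction, advance the source
        {
            res_timer -= 1.0f;
            res_pos++;
        }
    }
}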
- - if (format.format == AUDIO_S16 || format.format == AUDIO_U16) - { - int srcinc = format.channels; - len /= (2 * srcinc); - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = ((u16*)buf)[res_pos]; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else if (format.format == AUDIO_S8 || format.format == AUDIO_U8) - { - int srcinc = format.channels; - len /= srcinc; - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = buf[res_pos] << 8; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else - printf("bad WAV format %08X\n", format.format); - - SDL_FreeWAV(buf); -} - -void AudioCallback(void* data, Uint8* stream, int len) -{ - len /= (sizeof(s16) * 2); - - // resample incoming audio to match the output sample rate - - float f_len_in = (len * 32823.6328125) / (float)AudioFreq; - f_len_in += AudioSampleFrac; - int len_in = (int)floor(f_len_in); - AudioSampleFrac = f_len_in - len_in; - - s16 buf_in[1024*2]; - s16* buf_out = (s16*)stream; - - int num_in; - int num_out = len; - - SDL_LockMutex(AudioSyncLock); - num_in = SPU::ReadOutput(buf_in, len_in); - SDL_CondSignal(AudioSync); - SDL_UnlockMutex(AudioSyncLock); - - if (num_in < 1) - { - memset(stream, 0, len*sizeof(s16)*2); - return; - } - - int margin = 6; - if (num_in < len_in-margin) - { - int last = num_in-1; - if (last < 0) last = 0; - - for (int i = num_in; i < len_in-margin; i++) - ((u32*)buf_in)[i] = ((u32*)buf_in)[last]; - - num_in = len_in-margin; - } - - float res_incr = num_in / (float)num_out; - float res_timer = 0; - int res_pos = 0; - - int volume = Config::AudioVolume; - - for (int i = 0; i < len; i++) - { - buf_out[i*2 ] = (buf_in[res_pos*2 ] * volume) >> 8; - buf_out[i*2+1] = (buf_in[res_pos*2+1] * volume) >> 8; - - /*s16 s_l = buf_in[res_pos*2 ]; - s16 s_r = buf_in[res_pos*2+1]; - - float a = res_timer; - float b = 1.0 - a; - s_l = (s_l * a) + (buf_in[(res_pos-1)*2 ] * b); - s_r = (s_r * a) + (buf_in[(res_pos-1)*2+1] * b); - - buf_out[i*2 ] = (s_l * volume) >> 8; - buf_out[i*2+1] = (s_r * volume) >> 8;*/ - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos++; - } - } -} - -void MicCallback(void* data, Uint8* stream, int len) -{ - if (Config::MicInputType != 1) return; - - s16* input = (s16*)stream; - len /= sizeof(s16); - - if ((MicBufferWritePos + len) > MicBufferLength) - { - u32 len1 = MicBufferLength - MicBufferWritePos; - memcpy(&MicBuffer[MicBufferWritePos], &input[0], len1*sizeof(s16)); - memcpy(&MicBuffer[0], &input[len1], (len - len1)*sizeof(s16)); - MicBufferWritePos = len - len1; - } - else - { - memcpy(&MicBuffer[MicBufferWritePos], input, len*sizeof(s16)); - MicBufferWritePos += len; - } -} - -void FeedMicInput() -{ - int type = Config::MicInputType; - bool cmd = HotkeyDown(HK_Mic); - - if ((type != 1 && !cmd) || - (type == 1 && 
MicBufferLength == 0) ||
-        (type == 3 && MicWavBuffer == NULL))
-    {
-        type = 0;
-        MicBufferReadPos = 0;
-    }
-
-    switch (type)
-    {
-    case 0: // no mic
-        NDS::MicInputFrame(NULL, 0);
-        break;
-
-    case 1: // host mic
-        if ((MicBufferReadPos + 735) > MicBufferLength)
-        {
-            s16 tmp[735];
-            u32 len1 = MicBufferLength - MicBufferReadPos;
-            memcpy(&tmp[0], &MicBuffer[MicBufferReadPos], len1*sizeof(s16));
-            memcpy(&tmp[len1], &MicBuffer[0], (735 - len1)*sizeof(s16));
-
-            NDS::MicInputFrame(tmp, 735);
-            MicBufferReadPos = 735 - len1;
-        }
-        else
-        {
-            NDS::MicInputFrame(&MicBuffer[MicBufferReadPos], 735);
-            MicBufferReadPos += 735;
-        }
-        break;
-
-    case 2: // white noise
-        {
-            s16 tmp[735];
-            for (int i = 0; i < 735; i++) tmp[i] = rand() & 0xFFFF;
-            NDS::MicInputFrame(tmp, 735);
-        }
-        break;
-
-    case 3: // WAV
-        if ((MicBufferReadPos + 735) > MicWavLength)
-        {
-            s16 tmp[735];
-            u32 len1 = MicWavLength - MicBufferReadPos;
-            memcpy(&tmp[0], &MicWavBuffer[MicBufferReadPos], len1*sizeof(s16));
-            memcpy(&tmp[len1], &MicWavBuffer[0], (735 - len1)*sizeof(s16));
-
-            NDS::MicInputFrame(tmp, 735);
-            MicBufferReadPos = 735 - len1;
-        }
-        else
-        {
-            NDS::MicInputFrame(&MicWavBuffer[MicBufferReadPos], 735);
-            MicBufferReadPos += 735;
-        }
-        break;
-    }
-}
-
-void OpenJoystick()
-{
-    if (Joystick) SDL_JoystickClose(Joystick);
-
-    int num = SDL_NumJoysticks();
-    if (num < 1)
-    {
-        Joystick = NULL;
-        return;
-    }
-
-    if (JoystickID >= num)
-        JoystickID = 0;
-
-    Joystick = SDL_JoystickOpen(JoystickID);
-}
-
-bool JoystickButtonDown(int val)
-{
-    if (val == -1) return false;
-
-    bool hasbtn = ((val & 0xFFFF) != 0xFFFF);
-
-    if (hasbtn)
-    {
-        if (val & 0x100)
-        {
-            int hatnum = (val >> 4) & 0xF;
-            int hatdir = val & 0xF;
-            Uint8 hatval = SDL_JoystickGetHat(Joystick, hatnum);
-
-            bool pressed = false;
-            if (hatdir == 0x1) pressed = (hatval & SDL_HAT_UP);
-            else if (hatdir == 0x4) pressed = (hatval & SDL_HAT_DOWN);
-            else if (hatdir == 0x2) pressed = (hatval & SDL_HAT_RIGHT);
-            else if (hatdir == 0x8) pressed = (hatval & SDL_HAT_LEFT);
-
-            if (pressed) return true;
-        }
-        else
-        {
-            int btnnum = val & 0xFFFF;
-            Uint8 btnval = SDL_JoystickGetButton(Joystick, btnnum);
-
-            if (btnval) return true;
-        }
-    }
-
-    if (val & 0x10000)
-    {
-        int axisnum = (val >> 24) & 0xF;
-        int axisdir = (val >> 20) & 0xF;
-        Sint16 axisval = SDL_JoystickGetAxis(Joystick, axisnum);
-
-        switch (axisdir)
-        {
-        case 0: // positive
-            if (axisval > 16384) return true;
-            break;
-
-        case 1: // negative
-            if (axisval < -16384) return true;
-            break;
-
-        case 2: // trigger
-            if (axisval > 0) return true;
-            break;
-        }
-    }
-
-    return false;
-}
-
-void ProcessInput()
-{
-    SDL_JoystickUpdate();
-
-    if (Joystick)
-    {
-        if (!SDL_JoystickGetAttached(Joystick))
-        {
-            SDL_JoystickClose(Joystick);
-            Joystick = NULL;
-        }
-    }
-    if (!Joystick && (SDL_NumJoysticks() > 0))
-    {
-        JoystickID = Config::JoystickID;
-        OpenJoystick();
-    }
-
-    JoyInputMask = 0xFFF;
-    for (int i = 0; i < 12; i++)
-        if (JoystickButtonDown(Config::JoyMapping[i]))
-            JoyInputMask &= ~(1<<i);
-
-    JoyHotkeyMask = 0;
-    for (int i = 0; i < HK_MAX; i++)
-        if (JoystickButtonDown(Config::HKJoyMapping[i]))
-            JoyHotkeyMask |= (1<<i);
-
-    HotkeyMask = KeyHotkeyMask | JoyHotkeyMask;
-    HotkeyPress = HotkeyMask & ~LastHotkeyMask;
-    HotkeyRelease = LastHotkeyMask & ~HotkeyMask;
-    LastHotkeyMask = HotkeyMask;
-}
-
-bool JoyButtonPressed(int btnid, int njoybuttons, Uint8* joybuttons, Uint32 hat)
-{
-    if (btnid < 0) return false;
-
-    hat &= ~(hat >
return false; - - bool pressed = false; - if (btnid == 0x101) // up - pressed = (hat & SDL_HAT_UP); - else if (btnid == 0x104) // down - pressed = (hat & SDL_HAT_DOWN); - else if (btnid == 0x102) // right - pressed = (hat & SDL_HAT_RIGHT); - else if (btnid == 0x108) // left - pressed = (hat & SDL_HAT_LEFT); - else if (btnid < njoybuttons) - pressed = joybuttons[btnid] & 0x01; - - return pressed; -} - -void UpdateWindowTitle(void* data) -{ - if (EmuStatus == 0) return; - void** dataarray = (void**)data; - SDL_LockMutex((SDL_mutex*)dataarray[1]); - uiWindowSetTitle(MainWindow, (const char*)dataarray[0]); - SDL_UnlockMutex((SDL_mutex*)dataarray[1]); -} - -void UpdateFPSLimit(void* data) -{ - uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1); -} - -int EmuThreadFunc(void* burp) -{ - NDS::Init(); - - MainScreenPos[0] = 0; - MainScreenPos[1] = 0; - MainScreenPos[2] = 0; - AutoScreenSizing = 0; - - if (Screen_UseGL) - { - uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(true); - uiGLMakeContextCurrent(NULL); - } - else - { - GPU3D::InitRenderer(false); - } - - Touching = false; - KeyInputMask = 0xFFF; - JoyInputMask = 0xFFF; - KeyHotkeyMask = 0; - JoyHotkeyMask = 0; - HotkeyMask = 0; - LastHotkeyMask = 0; - LidStatus = false; - - u32 nframes = 0; - u32 starttick = SDL_GetTicks(); - u32 lasttick = starttick; - u32 lastmeasuretick = lasttick; - u32 fpslimitcount = 0; - u64 perfcount = SDL_GetPerformanceCounter(); - u64 perffreq = SDL_GetPerformanceFrequency(); - float samplesleft = 0; - u32 nsamples = 0; - - char melontitle[100]; - SDL_mutex* titlemutex = SDL_CreateMutex(); - void* titledata[2] = {melontitle, titlemutex}; - - while (EmuRunning != 0) - { - ProcessInput(); - - if (HotkeyPressed(HK_FastForwardToggle)) - { - Config::LimitFPS = !Config::LimitFPS; - uiQueueMain(UpdateFPSLimit, NULL); - } - // TODO: similar hotkeys for video/audio sync? - - if (HotkeyPressed(HK_Pause)) uiQueueMain(TogglePause, NULL); - if (HotkeyPressed(HK_Reset)) uiQueueMain(Reset, NULL); - - if (GBACart::CartInserted && GBACart::HasSolarSensor) - { - if (HotkeyPressed(HK_SolarSensorDecrease)) - { - if (GBACart_SolarSensor::LightLevel > 0) GBACart_SolarSensor::LightLevel--; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - if (HotkeyPressed(HK_SolarSensorIncrease)) - { - if (GBACart_SolarSensor::LightLevel < 10) GBACart_SolarSensor::LightLevel++; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - } - - if (EmuRunning == 1) - { - EmuStatus = 1; - - // process input and hotkeys - NDS::SetKeyMask(KeyInputMask & JoyInputMask); - - if (HotkeyPressed(HK_Lid)) - { - LidStatus = !LidStatus; - NDS::SetLidClosed(LidStatus); - OSD::AddMessage(0, LidStatus ? "Lid closed" : "Lid opened"); - } - - // microphone input - FeedMicInput(); - - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - } - - // auto screen layout - { - MainScreenPos[2] = MainScreenPos[1]; - MainScreenPos[1] = MainScreenPos[0]; - MainScreenPos[0] = NDS::PowerControl9 >> 15; - - int guess; - if (MainScreenPos[0] == MainScreenPos[2] && - MainScreenPos[0] != MainScreenPos[1]) - { - // constant flickering, likely displaying 3D on both screens - // TODO: when both screens are used for 2D only...??? 
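// The three-entry history shifted below tracks NDS::PowerControl9 (POWCNT1)
// bit 15, which selects the physical screen the main 2D engine drives;
// MainScreenPos[0] is the newest sample. An A-B-A pattern means the game
// swaps engines every frame, i.e. it is likely showing 3D on both screens.
// The decision in isolation (sketch; GuessScreenSizing is a hypothetical
// name; returns a sizing mode as used here: 0=even, 1=top, 2=bottom):
int GuessScreenSizing(const int pos[3])
{
    if (pos[0] == pos[2] && pos[0] != pos[1])
        return 0;                  // flickering: treat both screens evenly
    return (pos[0] == 1) ? 1 : 2;  // stable: emphasize the main screen
}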
- guess = 0; - } - else - { - if (MainScreenPos[0] == 1) - guess = 1; - else - guess = 2; - } - - if (guess != AutoScreenSizing) - { - AutoScreenSizing = guess; - SetupScreenRects(WindowWidth, WindowHeight); - } - } - - // emulate - u32 nlines = NDS::RunFrame(); - -#ifdef MELONCAP - MelonCap::Update(); -#endif // MELONCAP - - if (EmuRunning == 0) break; - - if (Screen_UseGL) - { - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - - bool fastforward = HotkeyDown(HK_FastForward); - - if (Config::AudioSync && !fastforward) - { - SDL_LockMutex(AudioSyncLock); - while (SPU::GetOutputSize() > 1024) - { - int ret = SDL_CondWaitTimeout(AudioSync, AudioSyncLock, 500); - if (ret == SDL_MUTEX_TIMEDOUT) break; - } - SDL_UnlockMutex(AudioSyncLock); - } - else - { - // ensure the audio FIFO doesn't overflow - //SPU::TrimOutput(); - } - - float framerate = (1000.0f * nlines) / (60.0f * 263.0f); - - { - u32 curtick = SDL_GetTicks(); - u32 delay = curtick - lasttick; - - bool limitfps = Config::LimitFPS && !fastforward; - if (limitfps) - { - float wantedtickF = starttick + (framerate * (fpslimitcount+1)); - u32 wantedtick = (u32)ceil(wantedtickF); - if (curtick < wantedtick) SDL_Delay(wantedtick - curtick); - - lasttick = SDL_GetTicks(); - fpslimitcount++; - if ((abs(wantedtickF - (float)wantedtick) < 0.001312) || (fpslimitcount > 60)) - { - fpslimitcount = 0; - nsamples = 0; - starttick = lasttick; - } - } - else - { - if (delay < 1) SDL_Delay(1); - lasttick = SDL_GetTicks(); - } - } - - nframes++; - if (nframes >= 30) - { - u32 tick = SDL_GetTicks(); - u32 diff = tick - lastmeasuretick; - lastmeasuretick = tick; - - u32 fps; - if (diff < 1) fps = 77777; - else fps = (nframes * 1000) / diff; - nframes = 0; - - float fpstarget; - if (framerate < 1) fpstarget = 999; - else fpstarget = 1000.0f/framerate; - - SDL_LockMutex(titlemutex); - sprintf(melontitle, "[%d/%.0f] melonDS " MELONDS_VERSION, fps, fpstarget); - SDL_UnlockMutex(titlemutex); - uiQueueMain(UpdateWindowTitle, titledata); - } - } - else - { - // paused - nframes = 0; - lasttick = SDL_GetTicks(); - starttick = lasttick; - lastmeasuretick = lasttick; - fpslimitcount = 0; - - if (EmuRunning == 2) - { - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - } - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - EmuStatus = EmuRunning; - - SDL_Delay(100); - } - } - - EmuStatus = 0; - - SDL_DestroyMutex(titlemutex); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - - NDS::DeInit(); - Platform::LAN_DeInit(); - - if (Screen_UseGL) - { - OSD::DeInit(true); - GLScreen_DeInit(); - } - else - OSD::DeInit(false); - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - return 44203; -} - -void StopEmuThread() -{ - EmuRunning = 0; - SDL_WaitThread(EmuThread, NULL); -} - - -void OnAreaDraw(uiAreaHandler* handler, uiArea* area, uiAreaDrawParams* params) -{ - if (!ScreenDrawInited) - { - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenDrawInited = true; - ScreenBitmap[0] = uiDrawNewBitmap(params->Context, 256, 192, 0); - ScreenBitmap[1] = uiDrawNewBitmap(params->Context, 256, 192, 0); - } - - int frontbuf = GPU::FrontBuffer; - if (!ScreenBitmap[0] || !ScreenBitmap[1]) return; - if (!GPU::Framebuffer[frontbuf][0] || !GPU::Framebuffer[frontbuf][1]) return; - - uiRect top = {0, 0, 256, 192}; - uiRect bot = {0, 0, 256, 192}; 
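// The frame limiter in EmuThreadFunc() above paces against an absolute
// schedule rather than a fixed per-frame delay: the target tick for frame N
// is starttick + frametime*(N+1), so rounding error cannot accumulate (the
// loop above also resyncs starttick periodically to absorb drift after
// pauses). One pacing step of the same scheme (sketch; hypothetical helper,
// frametime in milliseconds, SDL2):
void PaceFrame(Uint32 starttick, u32& fpslimitcount, float frametime)
{
    float target_f = starttick + frametime * (fpslimitcount + 1);
    Uint32 target = (Uint32)ceil(target_f);   // next absolute deadline
    Uint32 now = SDL_GetTicks();
    if (now < target) SDL_Delay(target - now);
    fpslimitcount++;
}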
-
-    uiDrawBitmapUpdate(ScreenBitmap[0], GPU::Framebuffer[frontbuf][0]);
-    uiDrawBitmapUpdate(ScreenBitmap[1], GPU::Framebuffer[frontbuf][1]);
-
-    uiDrawSave(params->Context);
-    uiDrawTransform(params->Context, &TopScreenTrans);
-    uiDrawBitmapDraw(params->Context, ScreenBitmap[0], &top, &TopScreenRect, Config::ScreenFilter==1);
-    uiDrawRestore(params->Context);
-
-    uiDrawSave(params->Context);
-    uiDrawTransform(params->Context, &BottomScreenTrans);
-    uiDrawBitmapDraw(params->Context, ScreenBitmap[1], &bot, &BottomScreenRect, Config::ScreenFilter==1);
-    uiDrawRestore(params->Context);
-
-    OSD::Update(false, params);
-}
-
-void OnAreaMouseEvent(uiAreaHandler* handler, uiArea* area, uiAreaMouseEvent* evt)
-{
-    int x = (int)evt->X;
-    int y = (int)evt->Y;
-
-    if (Touching && (evt->Up == 1))
-    {
-        Touching = false;
-        NDS::ReleaseKey(16+6);
-        NDS::ReleaseScreen();
-    }
-    else if (!Touching && (evt->Down == 1) &&
-             (x >= BottomScreenRect.X) && (y >= BottomScreenRect.Y) &&
-             (x < (BottomScreenRect.X+BottomScreenRect.Width)) && (y < (BottomScreenRect.Y+BottomScreenRect.Height)))
-    {
-        Touching = true;
-        NDS::PressKey(16+6);
-    }
-
-    if (Touching)
-    {
-        x -= BottomScreenRect.X;
-        y -= BottomScreenRect.Y;
-
-        if (ScreenRotation == 0 || ScreenRotation == 2)
-        {
-            if (BottomScreenRect.Width != 256)
-                x = (x * 256) / BottomScreenRect.Width;
-            if (BottomScreenRect.Height != 192)
-                y = (y * 192) / BottomScreenRect.Height;
-
-            if (ScreenRotation == 2)
-            {
-                x = 255 - x;
-                y = 191 - y;
-            }
-        }
-        else
-        {
-            if (BottomScreenRect.Width != 192)
-                x = (x * 192) / BottomScreenRect.Width;
-            if (BottomScreenRect.Height != 256)
-                y = (y * 256) / BottomScreenRect.Height;
-
-            if (ScreenRotation == 1)
-            {
-                int tmp = x;
-                x = y;
-                y = 191 - tmp;
-            }
-            else
-            {
-                int tmp = x;
-                x = 255 - y;
-                y = tmp;
-            }
-        }
-
-        // clamp
-        if (x < 0) x = 0;
-        else if (x > 255) x = 255;
-        if (y < 0) y = 0;
-        else if (y > 191) y = 191;
-
-        // TODO: take advantage of possible extra precision when possible? (scaled window for example)
-        NDS::TouchScreen(x, y);
-    }
-}
-
-void OnAreaMouseCrossed(uiAreaHandler* handler, uiArea* area, int left)
-{
-}
-
-void OnAreaDragBroken(uiAreaHandler* handler, uiArea* area)
-{
-}
-
-bool EventMatchesKey(uiAreaKeyEvent* evt, int val, bool checkmod)
-{
-    if (val == -1) return false;
-
-    int key = val & 0xFFFF;
-    int mod = val >> 16;
-    return evt->Scancode == key && (!checkmod || evt->Modifiers == mod);
-}
-
-int OnAreaKeyEvent(uiAreaHandler* handler, uiArea* area, uiAreaKeyEvent* evt)
-{
-    // TODO: release all keys if the window loses focus? or somehow global key input?
-    if (evt->Scancode == 0x38) // ALT
-        return 0;
-    if (evt->Modifiers == 0x2) // ALT+key
-        return 0;
-
-    if (evt->Up)
-    {
-        for (int i = 0; i < 12; i++)
-            if (EventMatchesKey(evt, Config::KeyMapping[i], false))
-                KeyInputMask |= (1<<i);
-
-        for (int i = 0; i < HK_MAX; i++)
-            if (EventMatchesKey(evt, Config::HKKeyMapping[i], true))
-                KeyHotkeyMask &= ~(1<<i);
-    }
-    else if (!evt->Repeat)
-    {
-        // TODO, eventually: make savestate keys configurable?
-        // F keys: 3B-44, 57-58 | SHIFT: mod. 0x4
-        if (evt->Scancode >= 0x3B && evt->Scancode <= 0x42) // F1-F8, quick savestate
-        {
-            if (evt->Modifiers == 0x4) SaveState(1 + (evt->Scancode - 0x3B));
-            else if (evt->Modifiers == 0x0) LoadState(1 + (evt->Scancode - 0x3B));
-        }
-        else if (evt->Scancode == 0x43) // F9, savestate from/to file
-        {
-            if (evt->Modifiers == 0x4) SaveState(0);
-            else if (evt->Modifiers == 0x0) LoadState(0);
-        }
-        else if (evt->Scancode == 0x58) // F12, undo savestate
-        {
-            if (evt->Modifiers == 0x0) UndoStateLoad();
-        }
-
-        for (int i = 0; i < 12; i++)
-            if (EventMatchesKey(evt, Config::KeyMapping[i], false))
-                KeyInputMask &= ~(1<<i);
-
-        for (int i = 0; i < HK_MAX; i++)
-            if (EventMatchesKey(evt, Config::HKKeyMapping[i], true))
-                KeyHotkeyMask |= (1<<i);
-
-        //if (evt->Scancode == 0x57) // F11
-        //    NDS::debug(0);
-    }
-
-    return 1;
-}
-
-void SetupScreenRects(int width, int height)
-{
-    bool horizontal = false;
-    bool sideways = false;
-
-    if (ScreenRotation == 1 || ScreenRotation == 3)
-        sideways = true;
-
-    if (ScreenLayout == 2) horizontal = true;
-    else if (ScreenLayout == 0)
-    {
-        if (sideways)
-            horizontal = true;
-    }
-
-    int sizemode;
-    if (ScreenSizing == 3)
-        sizemode = AutoScreenSizing;
-    else
-        sizemode = ScreenSizing;
-
-    int screenW, screenH, gap;
-    if (sideways)
-    {
-        screenW = 192;
-        screenH = 256;
-    }
-    else
-    {
-        screenW = 256;
-        screenH = 192;
-    }
-
-    gap = ScreenGap;
-
-    uiRect *topscreen, *bottomscreen;
-    if (ScreenRotation == 1 || ScreenRotation == 2)
-    {
-        topscreen = &BottomScreenRect;
-        bottomscreen = &TopScreenRect;
-    }
-    else
-    {
-        topscreen = &TopScreenRect;
-        bottomscreen = &BottomScreenRect;
-    }
-
-    if (horizontal)
-    {
-        // side-by-side
-
-        int heightreq;
-        int startX = 0;
-
-        width -= gap;
-
-        if (sizemode == 0) // even
-        {
-            heightreq = (width * screenH) / (screenW*2);
-            if (heightreq > height)
-            {
-                int newwidth = (height * width) / heightreq;
-                startX = (width - newwidth) / 2;
-                heightreq = height;
-                width = newwidth;
-            }
-        }
-        else // emph. top/bottom
-        {
-            heightreq = ((width - screenW) * screenH) / screenW;
-            if (heightreq > height)
-            {
-                int newwidth = ((height * (width - screenW)) / heightreq) + screenW;
-                startX = (width - newwidth) / 2;
-                heightreq = height;
-                width = newwidth;
-            }
-        }
-
-        if (sizemode == 2)
-        {
-            topscreen->Width = screenW;
-            topscreen->Height = screenH;
-        }
-        else
-        {
-            topscreen->Width = (sizemode==0) ? (width / 2) : (width - screenW);
-            topscreen->Height = heightreq;
-        }
-        topscreen->X = startX;
-        topscreen->Y = ((height - heightreq) / 2) + (heightreq - topscreen->Height);
-
-        bottomscreen->X = topscreen->X + topscreen->Width + gap;
-
-        if (sizemode == 1)
-        {
-            bottomscreen->Width = screenW;
-            bottomscreen->Height = screenH;
-        }
-        else
-        {
-            bottomscreen->Width = width - topscreen->Width;
-            bottomscreen->Height = heightreq;
-        }
-        bottomscreen->Y = ((height - heightreq) / 2) + (heightreq - bottomscreen->Height);
-    }
-    else
-    {
-        // top then bottom
-
-        int widthreq;
-        int startY = 0;
-
-        height -= gap;
-
-        if (sizemode == 0) // even
-        {
-            widthreq = (height * screenW) / (screenH*2);
-            if (widthreq > width)
-            {
-                int newheight = (width * height) / widthreq;
-                startY = (height - newheight) / 2;
-                widthreq = width;
-                height = newheight;
-            }
-        }
-        else // emph. top/bottom
-        {
-            widthreq = ((height - screenH) * screenW) / screenH;
-            if (widthreq > width)
-            {
-                int newheight = ((width * (height - screenH)) / widthreq) + screenH;
-                startY = (height - newheight) / 2;
-                widthreq = width;
-                height = newheight;
-            }
-        }
-
-        if (sizemode == 2)
-        {
-            topscreen->Width = screenW;
-            topscreen->Height = screenH;
-        }
-        else
-        {
-            topscreen->Width = widthreq;
-            topscreen->Height = (sizemode==0) ?
(height / 2) : (height - screenH); - } - topscreen->Y = startY; - topscreen->X = (width - topscreen->Width) / 2; - - bottomscreen->Y = topscreen->Y + topscreen->Height + gap; - - if (sizemode == 1) - { - bottomscreen->Width = screenW; - bottomscreen->Height = screenH; - } - else - { - bottomscreen->Width = widthreq; - bottomscreen->Height = height - topscreen->Height; - } - bottomscreen->X = (width - bottomscreen->Width) / 2; - } - - // setup matrices for potential rotation - - uiDrawMatrixSetIdentity(&TopScreenTrans); - uiDrawMatrixSetIdentity(&BottomScreenTrans); - - switch (ScreenRotation) - { - case 1: // 90° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y); - } - break; - - case 2: // 180° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - - case 3: // 270° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - } - - GL_ScreenSizeDirty = true; -} - -void SetMinSize(int w, int h) -{ - int cw, ch; - uiWindowContentSize(MainWindow, &cw, &ch); - - uiControlSetMinSize(uiControl(MainDrawArea), w, h); - if ((cw < w) || (ch < h)) - { - if (cw < w) cw = w; - if (ch < h) ch = h; - uiWindowSetContentSize(MainWindow, cw, ch); - } -} - -void OnAreaResize(uiAreaHandler* handler, uiArea* area, int width, int height) -{ - SetupScreenRects(width, height); - - // TODO: - // should those be the size of the uiArea, or the size of the window client area? - // for now the uiArea fills the whole window anyway - // but... 
we never know, I guess - WindowWidth = width; - WindowHeight = height; - - int ismax = uiWindowMaximized(MainWindow); - int ismin = uiWindowMinimized(MainWindow); - - Config::WindowMaximized = ismax; - if (!ismax && !ismin) - { - Config::WindowWidth = width; - Config::WindowHeight = height; - } - - OSD::WindowResized(Screen_UseGL); -} - - -void Run() -{ - EmuRunning = 1; - RunningSomething = true; - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - uiMenuItemEnable(MenuItem_SaveState); - uiMenuItemEnable(MenuItem_LoadState); - - if (SavestateLoaded) - uiMenuItemEnable(MenuItem_UndoStateLoad); - else - uiMenuItemDisable(MenuItem_UndoStateLoad); - - for (int i = 0; i < 8; i++) - { - char ssfile[1024]; - GetSavestateName(i+1, ssfile, 1024); - if (Platform::FileExists(ssfile)) uiMenuItemEnable(MenuItem_LoadStateSlot[i]); - else uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - } - - for (int i = 0; i < 9; i++) uiMenuItemEnable(MenuItem_SaveStateSlot[i]); - uiMenuItemEnable(MenuItem_LoadStateSlot[8]); - - uiMenuItemEnable(MenuItem_Pause); - uiMenuItemEnable(MenuItem_Reset); - uiMenuItemEnable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); -} - -void TogglePause(void* blarg) -{ - if (!RunningSomething) return; - - if (EmuRunning == 1) - { - // enable pause - EmuRunning = 2; - uiMenuItemSetChecked(MenuItem_Pause, 1); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0, "Paused"); - } - else - { - // disable pause - EmuRunning = 1; - uiMenuItemSetChecked(MenuItem_Pause, 0); - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - OSD::AddMessage(0, "Resumed"); - } -} - -void Reset(void* blarg) -{ - if (!RunningSomething) return; - - EmuRunning = 2; - while (EmuStatus != 2); - - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - if (ROMPath[0][0] == '\0') - NDS::LoadBIOS(); - else - { - SetupSRAMPath(0); - NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot); - } - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - - Run(); - - OSD::AddMessage(0, "Reset"); -} - -void Stop(bool internal) -{ - EmuRunning = 2; - if (!internal) // if shutting down from the UI thread, wait till the emu thread has stopped - while (EmuStatus != 2); - RunningSomething = false; - - // eject any inserted GBA cartridge - GBACart::Eject(); - ROMPath[1][0] = '\0'; - - uiWindowSetTitle(MainWindow, "melonDS " MELONDS_VERSION); - - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]); - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - uiMenuItemDisable(MenuItem_UndoStateLoad); - - uiMenuItemDisable(MenuItem_Pause); - uiMenuItemDisable(MenuItem_Reset); - uiMenuItemDisable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); - - uiAreaQueueRedrawAll(MainDrawArea); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0xFFC040, "Shutdown"); -} - -void SetupSRAMPath(int slot) -{ - strncpy(SRAMPath[slot], ROMPath[slot], 1023); - SRAMPath[slot][1023] = '\0'; - strncpy(SRAMPath[slot] + strlen(ROMPath[slot]) - 3, "sav", 3); -} - -void TryLoadROM(char* file, int slot, int prevstatus) -{ - char oldpath[1024]; - char oldsram[1024]; - strncpy(oldpath, ROMPath[slot], 1024); - strncpy(oldsram, SRAMPath[slot], 1024); - - 
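// SetupSRAMPath() above blindly overwrites the last three characters of the
// ROM path with "sav", which only works for three-letter extensions on
// paths longer than three characters. A more defensive variant (sketch;
// hypothetical helper, same globals as above, uses <string.h>):
void SetupSRAMPathSafe(int slot)
{
    strncpy(SRAMPath[slot], ROMPath[slot], 1023);
    SRAMPath[slot][1023] = '\0';

    char* dot = strrchr(SRAMPath[slot], '.');        // last extension dot, if any
    size_t base = dot ? (size_t)(dot - SRAMPath[slot]) : strlen(SRAMPath[slot]);
    if (base > 1023 - 4) base = 1023 - 4;            // leave room for ".sav"
    strcpy(&SRAMPath[slot][base], ".sav");
}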
strncpy(ROMPath[slot], file, 1023); - ROMPath[slot][1023] = '\0'; - - SetupSRAMPath(0); - SetupSRAMPath(1); - - if (slot == 0 && NDS::LoadROM(ROMPath[slot], SRAMPath[slot], Config::DirectBoot)) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - // Reload the inserted GBA cartridge (if any) - if (ROMPath[1][0] != '\0') NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - Run(); - } - else if (slot == 1 && NDS::LoadGBAROM(ROMPath[slot], SRAMPath[slot])) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - if (RunningSomething) Run(); // do not start just from a GBA cart - } - else - { - uiMsgBoxError(MainWindow, - "Failed to load the ROM", - "Make sure the file can be accessed and isn't opened in another application."); - - strncpy(ROMPath[slot], oldpath, 1024); - strncpy(SRAMPath[slot], oldsram, 1024); - EmuRunning = prevstatus; - } -} - - -// SAVESTATE TODO -// * configurable paths. not everyone wants their ROM directory to be polluted, I guess. - -void GetSavestateName(int slot, char* filename, int len) -{ - int pos; - - if (ROMPath[0][0] == '\0') // running firmware, no ROM - { - strcpy(filename, "firmware"); - pos = 8; - } - else - { - int l = strlen(ROMPath[0]); - pos = l; - while (ROMPath[0][pos] != '.' && pos > 0) pos--; - if (pos == 0) pos = l; - - // avoid buffer overflow. shoddy - if (pos > len-5) pos = len-5; - - strncpy(&filename[0], ROMPath[0], pos); - } - strcpy(&filename[pos], ".ml"); - filename[pos+3] = '0'+slot; - filename[pos+4] = '\0'; -} - -void LoadState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiOpenFile(MainWindow, "melonDS savestate (any)|*.ml1;*.ml2;*.ml3;*.ml4;*.ml5;*.ml6;*.ml7;*.ml8;*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - if (!Platform::FileExists(filename)) - { - char msg[64]; - if (slot > 0) sprintf(msg, "State slot %d is empty", slot); - else sprintf(msg, "State file does not exist"); - OSD::AddMessage(0xFFA0A0, msg); - - EmuRunning = prevstatus; - return; - } - - u32 oldGBACartCRC = GBACart::CartCRC; - - // backup - Savestate* backup = new Savestate("timewarp.mln", true); - NDS::DoSavestate(backup); - delete backup; - - bool failed = false; - - Savestate* state = new Savestate(filename, false); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not load savestate file."); - - // current state might be crapoed, so restore from sane backup - state = new Savestate("timewarp.mln", false); - failed = true; - } - - NDS::DoSavestate(state); - delete state; - - if (!failed) - { - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(PrevSRAMPath[0], SRAMPath[0], 1024); - - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], false); - } - - bool loadedPartialGBAROM = false; - - // in case we have a GBA cart inserted, and the GBA ROM changes - // due to having loaded a save state, we do not want to reload - // the previous cartridge on reset, or commit writes to any - // loaded save file. therefore, their paths are "nulled". 
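// LoadState() above guards against a corrupt savestate by serializing the
// current state to "timewarp.mln" first, then rolling back from that backup
// if the incoming file fails to open. The pattern in isolation (sketch;
// Savestate and NDS::DoSavestate as used above, hypothetical wrapper name):
bool LoadStateGuarded(const char* path)
{
    Savestate* backup = new Savestate("timewarp.mln", true); // write backup
    NDS::DoSavestate(backup);
    delete backup;

    Savestate* state = new Savestate(path, false);           // open target
    bool ok = !state->Error;
    if (!ok)
    {
        delete state;
        state = new Savestate("timewarp.mln", false);        // roll back
    }
    NDS::DoSavestate(state);                                 // apply whichever
    delete state;
    return ok;
}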
- if (GBACart::CartInserted && GBACart::CartCRC != oldGBACartCRC) - { - ROMPath[1][0] = '\0'; - SRAMPath[1][0] = '\0'; - loadedPartialGBAROM = true; - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State loaded from slot %d%s", - slot, loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - else sprintf(msg, "State loaded from file%s", - loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - OSD::AddMessage(0, msg); - - SavestateLoaded = true; - uiMenuItemEnable(MenuItem_UndoStateLoad); - } - - EmuRunning = prevstatus; -} - -void SaveState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiSaveFile(MainWindow, "melonDS savestate (*.mln)|*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - Savestate* state = new Savestate(filename, true); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not save state."); - } - else - { - NDS::DoSavestate(state); - delete state; - - if (slot > 0) - uiMenuItemEnable(MenuItem_LoadStateSlot[slot-1]); - - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], true); - } - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State saved to slot %d", slot); - else sprintf(msg, "State saved to file"); - OSD::AddMessage(0, msg); - - EmuRunning = prevstatus; -} - -void UndoStateLoad() -{ - if (!SavestateLoaded) return; - - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - // pray that this works - // what do we do if it doesn't??? - // but it should work. - Savestate* backup = new Savestate("timewarp.mln", false); - NDS::DoSavestate(backup); - delete backup; - - if (ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], PrevSRAMPath[0], 1024); - NDS::RelocateSave(SRAMPath[0], false); - } - - OSD::AddMessage(0, "State load undone"); - - EmuRunning = prevstatus; -} - - -void CloseAllDialogs() -{ - DlgAudioSettings::Close(); - DlgEmuSettings::Close(); - DlgInputConfig::Close(0); - DlgInputConfig::Close(1); - DlgVideoSettings::Close(); - DlgWifiSettings::Close(); -} - - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - uiQuit(); - return 1; -} - -void OnDropFile(uiWindow* window, char* file, void* blarg) -{ - char* ext = &file[strlen(file)-3]; - int prevstatus = EmuRunning; - - if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl")) - { - if (RunningSomething) - { - EmuRunning = 2; - while (EmuStatus != 2); - } - - TryLoadROM(file, 0, prevstatus); - } - else if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } -} - -void OnGetFocus(uiWindow* window, void* blarg) -{ - uiControlSetFocus(uiControl(MainDrawArea)); -} - -void OnLoseFocus(uiWindow* window, void* blarg) -{ - // TODO: shit here? 
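// OnDropFile() above takes the extension as &file[strlen(file)-3], which
// reads out of bounds for paths shorter than three characters and cannot
// handle longer extensions. A safer lookup keyed on the last dot (sketch;
// hypothetical helper, uses <string.h>):
const char* FileExtension(const char* path)
{
    const char* dot = strrchr(path, '.');
    return dot ? dot + 1 : "";  // "" when the name has no extension
}
// usage: if (!strcasecmp(FileExtension(file), "nds")) ...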
-} - -void OnCloseByMenu(uiMenuItem* item, uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - DestroyMainWindow(); - uiQuit(); -} - -void OnOpenFile(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char* file = uiOpenFile(window, "DS ROM (*.nds)|*.nds;*.srl|GBA ROM (*.gba)|*.gba|Any file|*.*", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - int pos = strlen(file)-1; - while (file[pos] != '/' && file[pos] != '\\' && pos > 0) pos--; - strncpy(Config::LastROMFolder, file, pos); - Config::LastROMFolder[pos] = '\0'; - char* ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } - else - { - TryLoadROM(file, 0, prevstatus); - } - - uiFreeText(file); -} - -void OnSaveState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - SaveState(slot); -} - -void OnLoadState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - LoadState(slot); -} - -void OnUndoStateLoad(uiMenuItem* item, uiWindow* window, void* param) -{ - UndoStateLoad(); -} - -void OnRun(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) - { - ROMPath[0][0] = '\0'; - NDS::LoadBIOS(); - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - } - - Run(); -} - -void OnPause(uiMenuItem* item, uiWindow* window, void* blarg) -{ - TogglePause(NULL); -} - -void OnReset(uiMenuItem* item, uiWindow* window, void* blarg) -{ - Reset(NULL); -} - -void OnStop(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) return; - - Stop(false); -} - -void OnOpenEmuSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgEmuSettings::Open(); -} - -void OnOpenInputConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(0); -} - -void OnOpenHotkeyConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(1); -} - -void OnOpenVideoSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgVideoSettings::Open(); -} - -void OnOpenAudioSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgAudioSettings::Open(); -} - -void OnOpenWifiSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgWifiSettings::Open(); -} - - -void OnSetSavestateSRAMReloc(uiMenuItem* item, uiWindow* window, void* param) -{ - Config::SavestateRelocSRAM = uiMenuItemChecked(item) ? 
1:0; -} - - -void EnsureProperMinSize() -{ - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w0 = 256; - int h0 = 192; - int w1 = 256; - int h1 = 192; - - if (ScreenLayout == 0) // natural - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - SetMinSize(std::max(h0,h1), w0+ScreenGap+w1); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else // horizontal - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(w0+ScreenGap+w1, std::max(h0,h1)); - } -} - -void OnSetScreenSize(uiMenuItem* item, uiWindow* window, void* param) -{ - int factor = *(int*)param; - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w = 256*factor; - int h = 192*factor; - - // FIXME - - if (ScreenLayout == 0) // natural - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - uiWindowSetContentSize(window, h, (w*2)+ScreenGap); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else // horizontal - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, (w*2)+ScreenGap, h); - } -} - -void OnSetScreenRotation(uiMenuItem* item, uiWindow* window, void* param) -{ - int rot = *(int*)param; - - int oldrot = ScreenRotation; - ScreenRotation = rot; - - int w, h; - uiWindowContentSize(window, &w, &h); - - bool isHori = (rot == 1 || rot == 3); - bool wasHori = (oldrot == 1 || oldrot == 3); - - EnsureProperMinSize(); - - if (ScreenLayout == 0) // natural - { - if (isHori ^ wasHori) - { - int blarg = h; - h = w; - w = blarg; - - uiWindowSetContentSize(window, w, h); - } - } - - SetupScreenRects(w, h); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenRot[i], i==ScreenRotation); -} - -void OnSetScreenGap(uiMenuItem* item, uiWindow* window, void* param) -{ - int gap = *(int*)param; - - //int oldgap = ScreenGap; - ScreenGap = gap; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 6; i++) - uiMenuItemSetChecked(MenuItem_ScreenGap[i], kScreenGap[i]==ScreenGap); -} - -void OnSetScreenLayout(uiMenuItem* item, uiWindow* window, void* param) -{ - int layout = *(int*)param; - ScreenLayout = layout; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 3; i++) - uiMenuItemSetChecked(MenuItem_ScreenLayout[i], i==ScreenLayout); -} - -void OnSetScreenSizing(uiMenuItem* item, uiWindow* window, void* param) -{ - int sizing = *(int*)param; - ScreenSizing = sizing; - - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenSizing[i], i==ScreenSizing); -} - -void OnSetScreenFiltering(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ScreenFilter = 1; - else Config::ScreenFilter = 0; -} - -void OnSetLimitFPS(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::LimitFPS = true; - else Config::LimitFPS = false; -} - -void OnSetAudioSync(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::AudioSync = true; - else Config::AudioSync = false; -} - -void OnSetShowOSD(uiMenuItem* item, 
uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ShowOSD = true; - else Config::ShowOSD = false; -} - -void ApplyNewSettings(int type) -{ -#ifdef JIT_ENABLED - if (type == 4) - { - Reset(NULL); - return; - } -#endif - - if (!RunningSomething) - { - if (type == 1) return; - } - - int prevstatus = EmuRunning; - EmuRunning = 3; - while (EmuStatus != 3); - - if (type == 0) // 3D renderer settings - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::UpdateRendererConfig(); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - GL_3DScale = Config::GL_ScaleFactor; // dorp - GL_ScreenSizeDirty = true; - } - else if (type == 1) // wifi settings - { - if (Wifi::MPInited) - { - Platform::MP_DeInit(); - Platform::MP_Init(); - } - - Platform::LAN_DeInit(); - Platform::LAN_Init(); - } - else if (type == 2) // video output method - { - bool usegl = Config::ScreenUseGL || (Config::_3DRenderer != 0); - if (usegl != Screen_UseGL) - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - OSD::DeInit(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - Screen_UseGL = usegl; - RecreateMainWindow(usegl); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - } - else if (type == 3) // 3D renderer - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - EmuRunning = prevstatus; -} - - -void CreateMainWindowMenu() -{ - uiMenu* menu; - uiMenuItem* menuitem; - - menu = uiNewMenu("File"); - menuitem = uiMenuAppendItem(menu, "Open ROM..."); - uiMenuItemOnClicked(menuitem, OnOpenFile, NULL); - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Save state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tShift+F%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tShift+F9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnSaveState, (void*)&kSavestateNum[i]); - - MenuItem_SaveStateSlot[i] = ssitem; - } - - MenuItem_SaveState = uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Load state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tF%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tF9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnLoadState, (void*)&kSavestateNum[i]); - - MenuItem_LoadStateSlot[i] = ssitem; - } - - MenuItem_LoadState = uiMenuAppendSubmenu(menu, submenu); - } - menuitem = uiMenuAppendItem(menu, "Undo state load\tF12"); - uiMenuItemOnClicked(menuitem, OnUndoStateLoad, NULL); - MenuItem_UndoStateLoad = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Quit"); - uiMenuItemOnClicked(menuitem, OnCloseByMenu, NULL); - - menu = uiNewMenu("System"); - menuitem = uiMenuAppendItem(menu, "Run"); - uiMenuItemOnClicked(menuitem, OnRun, NULL); - menuitem = uiMenuAppendCheckItem(menu, "Pause"); - uiMenuItemOnClicked(menuitem, OnPause, NULL); - MenuItem_Pause = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Reset"); - uiMenuItemOnClicked(menuitem, OnReset, NULL); - MenuItem_Reset = menuitem; - menuitem = uiMenuAppendItem(menu, "Stop"); - uiMenuItemOnClicked(menuitem, OnStop, NULL); - 
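// ApplyNewSettings() above (like OnCloseWindow and Stop) uses a small
// handshake to get the emu thread to a safe point: the UI thread publishes a
// request through EmuRunning, then spin-waits until the emu loop echoes the
// value into EmuStatus from its paused branch. The protocol in isolation
// (sketch; hypothetical wrapper over the same globals; the busy-wait is
// bounded because the emu thread parks within roughly one frame):
void WithEmuStopped(void (*apply)())
{
    int prev = EmuRunning;
    EmuRunning = 3;          // ask the emu thread to park
    while (EmuStatus != 3);  // wait for the acknowledgement
    apply();                 // safe: emulation is not mid-frame
    EmuRunning = prev;       // resume the previous state
}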
MenuItem_Stop = menuitem; - - menu = uiNewMenu("Config"); - { - menuitem = uiMenuAppendItem(menu, "Emu settings"); - uiMenuItemOnClicked(menuitem, OnOpenEmuSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Input config"); - uiMenuItemOnClicked(menuitem, OnOpenInputConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Hotkey config"); - uiMenuItemOnClicked(menuitem, OnOpenHotkeyConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Video settings"); - uiMenuItemOnClicked(menuitem, OnOpenVideoSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Audio settings"); - uiMenuItemOnClicked(menuitem, OnOpenAudioSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Wifi settings"); - uiMenuItemOnClicked(menuitem, OnOpenWifiSettings, NULL); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Savestate settings"); - - MenuItem_SavestateSRAMReloc = uiMenuAppendCheckItem(submenu, "Separate savefiles"); - uiMenuItemOnClicked(MenuItem_SavestateSRAMReloc, OnSetSavestateSRAMReloc, NULL); - - uiMenuAppendSubmenu(menu, submenu); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Screen size"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%dx", kScreenSize[i]); - uiMenuItem* item = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(item, OnSetScreenSize, (void*)&kScreenSize[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen rotation"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%d", kScreenRot[i]*90); - MenuItem_ScreenRot[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenRot[i], OnSetScreenRotation, (void*)&kScreenRot[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Mid-screen gap"); - - //for (int i = 0; kScreenGap[i] != -1; i++) - for (int i = 0; i < 6; i++) - { - char name[32]; - sprintf(name, "%d pixels", kScreenGap[i]); - MenuItem_ScreenGap[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenGap[i], OnSetScreenGap, (void*)&kScreenGap[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen layout"); - - MenuItem_ScreenLayout[0] = uiMenuAppendCheckItem(submenu, "Natural"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[0], OnSetScreenLayout, (void*)&kScreenLayout[0]); - MenuItem_ScreenLayout[1] = uiMenuAppendCheckItem(submenu, "Vertical"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[1], OnSetScreenLayout, (void*)&kScreenLayout[1]); - MenuItem_ScreenLayout[2] = uiMenuAppendCheckItem(submenu, "Horizontal"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[2], OnSetScreenLayout, (void*)&kScreenLayout[2]); - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen sizing"); - - MenuItem_ScreenSizing[0] = uiMenuAppendCheckItem(submenu, "Even"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[0], OnSetScreenSizing, (void*)&kScreenSizing[0]); - MenuItem_ScreenSizing[1] = uiMenuAppendCheckItem(submenu, "Emphasize top"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[1], OnSetScreenSizing, (void*)&kScreenSizing[1]); - MenuItem_ScreenSizing[2] = uiMenuAppendCheckItem(submenu, "Emphasize bottom"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[2], OnSetScreenSizing, (void*)&kScreenSizing[2]); - MenuItem_ScreenSizing[3] = uiMenuAppendCheckItem(submenu, "Auto"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[3], OnSetScreenSizing, (void*)&kScreenSizing[3]); - - uiMenuAppendSubmenu(menu, submenu); - } - - MenuItem_ScreenFilter = 
uiMenuAppendCheckItem(menu, "Screen filtering"); - uiMenuItemOnClicked(MenuItem_ScreenFilter, OnSetScreenFiltering, NULL); - - MenuItem_ShowOSD = uiMenuAppendCheckItem(menu, "Show OSD"); - uiMenuItemOnClicked(MenuItem_ShowOSD, OnSetShowOSD, NULL); - - uiMenuAppendSeparator(menu); - - MenuItem_LimitFPS = uiMenuAppendCheckItem(menu, "Limit framerate"); - uiMenuItemOnClicked(MenuItem_LimitFPS, OnSetLimitFPS, NULL); - - MenuItem_AudioSync = uiMenuAppendCheckItem(menu, "Audio sync"); - uiMenuItemOnClicked(MenuItem_AudioSync, OnSetAudioSync, NULL); -} - -void CreateMainWindow(bool opengl) -{ - MainWindow = uiNewWindow("melonDS " MELONDS_VERSION, - WindowWidth, WindowHeight, - Config::WindowMaximized, 1, 1); - uiWindowOnClosing(MainWindow, OnCloseWindow, NULL); - - uiWindowSetDropTarget(MainWindow, 1); - uiWindowOnDropFile(MainWindow, OnDropFile, NULL); - - uiWindowOnGetFocus(MainWindow, OnGetFocus, NULL); - uiWindowOnLoseFocus(MainWindow, OnLoseFocus, NULL); - - ScreenDrawInited = false; - bool opengl_good = opengl; - - if (!opengl) MainDrawArea = uiNewArea(&MainDrawAreaHandler); - else MainDrawArea = uiNewGLArea(&MainDrawAreaHandler, kGLVersions); - - uiWindowSetChild(MainWindow, uiControl(MainDrawArea)); - uiControlSetMinSize(uiControl(MainDrawArea), 256, 384); - uiAreaSetBackgroundColor(MainDrawArea, 0, 0, 0); - - uiControlShow(uiControl(MainWindow)); - uiControlSetFocus(uiControl(MainDrawArea)); - - if (opengl_good) - { - GLContext = uiAreaGetGLContext(MainDrawArea); - if (!GLContext) opengl_good = false; - } - if (opengl_good) - { - uiGLMakeContextCurrent(GLContext); - uiGLSetVSync(Config::ScreenVSync); - if (!GLScreen_Init()) opengl_good = false; - if (opengl_good) - { - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Init(true); - } - uiGLMakeContextCurrent(NULL); - } - - if (opengl && !opengl_good) - { - printf("OpenGL: initialization failed\n"); - RecreateMainWindow(false); - Screen_UseGL = false; - } - - if (!opengl) OSD::Init(false); -} - -void DestroyMainWindow() -{ - uiControlDestroy(uiControl(MainWindow)); - - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenBitmap[0] = NULL; - ScreenBitmap[1] = NULL; -} - -void RecreateMainWindow(bool opengl) -{ - int winX, winY, maxi; - uiWindowPosition(MainWindow, &winX, &winY); - maxi = uiWindowMaximized(MainWindow); - DestroyMainWindow(); - CreateMainWindow(opengl); - uiWindowSetPosition(MainWindow, winX, winY); - uiWindowSetMaximized(MainWindow, maxi); -} - - -int main(int argc, char** argv) -{ - srand(time(NULL)); - - printf("melonDS " MELONDS_VERSION "\n"); - printf(MELONDS_URL "\n"); - -#if defined(__WIN32__) || defined(UNIX_PORTABLE) - if (argc > 0 && strlen(argv[0]) > 0) - { - int len = strlen(argv[0]); - while (len > 0) - { - if (argv[0][len] == '/') break; - if (argv[0][len] == '\\') break; - len--; - } - if (len > 0) - { - EmuDirectory = new char[len+1]; - strncpy(EmuDirectory, argv[0], len); - EmuDirectory[len] = '\0'; - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } -#else - const char* confdir = g_get_user_config_dir(); - const char* confname = "/melonDS"; - EmuDirectory = new char[strlen(confdir) + strlen(confname) + 1]; - strcat(EmuDirectory, confdir); - strcat(EmuDirectory, confname); -#endif - - // http://stackoverflow.com/questions/14543333/joystick-wont-work-using-sdl - SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1"); - - if 
-
-    // http://stackoverflow.com/questions/14543333/joystick-wont-work-using-sdl
-    SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1");
-
-    if (SDL_Init(SDL_INIT_HAPTIC) < 0)
-    {
-        printf("SDL couldn't init rumble\n");
-    }
-    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_JOYSTICK) < 0)
-    {
-        printf("SDL shat itself :(\n");
-        return 1;
-    }
-
-    SDL_JoystickEventState(SDL_ENABLE);
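The double SDL_Init above treats haptic (rumble) as optional and video/audio/joystick as mandatory; only the second failure aborts. SDL expresses the same intent more explicitly through SDL_InitSubSystem, shown here as a sketch rather than what the removed code did:

    if (SDL_InitSubSystem(SDL_INIT_HAPTIC) < 0)
        printf("SDL couldn't init rumble\n");  // non-fatal: rumble is a nicety
    if (SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_JOYSTICK) < 0)
        return 1;                              // fatal: the frontend cannot run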
-
-    uiInitOptions ui_opt;
-    memset(&ui_opt, 0, sizeof(uiInitOptions));
-    const char* ui_err = uiInit(&ui_opt);
-    if (ui_err != NULL)
-    {
-        printf("libui shat itself :( %s\n", ui_err);
-        uiFreeInitError(ui_err);
-        return 1;
-    }
-
-    Config::Load();
-
-    if (Config::AudioVolume < 0) Config::AudioVolume = 0;
-    else if (Config::AudioVolume > 256) Config::AudioVolume = 256;
-
-    if (!Platform::LocalFileExists("bios7.bin") ||
-        !Platform::LocalFileExists("bios9.bin") ||
-        !Platform::LocalFileExists("firmware.bin"))
-    {
-#if defined(__WIN32__) || defined(UNIX_PORTABLE)
-        const char* locationName = "the directory you run melonDS from";
-#else
-        char* locationName = EmuDirectory;
-#endif
-        char msgboxtext[512];
-        sprintf(msgboxtext,
-            "One or more of the following required files don't exist or couldn't be accessed:\n\n"
-            "bios7.bin -- ARM7 BIOS\n"
-            "bios9.bin -- ARM9 BIOS\n"
-            "firmware.bin -- firmware image\n\n"
-            "Dump the files from your DS and place them in %s.\n"
-            "Make sure that the files can be accessed.",
-            locationName
-        );
-
-        uiMsgBoxError(NULL, "BIOS/Firmware not found", msgboxtext);
-
-        uiUninit();
-        SDL_Quit();
-        return 0;
-    }
-    if (!Platform::LocalFileExists("firmware.bin.bak"))
-    {
-        // verify the firmware
-        //
-        // there are dumps of an old hacked firmware floating around on the internet
-        // and those are problematic
-        // the hack predates WFC, and, due to this, any game that alters the WFC
-        // access point data will brick that firmware due to it having critical
-        // data in the same area. it has the same problem on hardware.
-        //
-        // but this should help stop users from reporting that issue over and over
-        // again, when the issue is not from melonDS but from their firmware dump.
-        //
-        // I don't know about all the firmware hacks in existence, but the one I
-        // looked at has 0x180 bytes from the header repeated at 0x3FC80, but
-        // bytes 0x0C-0x14 are different.
-
-        FILE* f = Platform::OpenLocalFile("firmware.bin", "rb");
-        u8 chk1[0x180], chk2[0x180];
-
-        fseek(f, 0, SEEK_SET);
-        fread(chk1, 1, 0x180, f);
-        fseek(f, -0x380, SEEK_END);
-        fread(chk2, 1, 0x180, f);
-
-        memset(&chk1[0x0C], 0, 8);
-        memset(&chk2[0x0C], 0, 8);
-
-        fclose(f);
-
-        if (!memcmp(chk1, chk2, 0x180))
-        {
-            uiMsgBoxError(NULL,
-                "Problematic firmware dump",
-                "You are using an old hacked firmware dump.\n"
-                "Firmware boot will stop working if you run any game that alters WFC settings.\n\n"
-                "Note that the issue is not from melonDS, it would also happen on an actual DS.");
-        }
-    }
-    {
-        const char* romlist_missing = "Save memory type detection will not work correctly.\n\n"
-            "You should use the latest version of romlist.bin (provided in melonDS release packages).";
-#if !defined(UNIX_PORTABLE) && !defined(__WIN32__)
-        std::string missingstr = std::string(romlist_missing) +
-            "\n\nThe ROM list should be placed in " + g_get_user_data_dir() + "/melonds/, otherwise "
-            "melonDS will search for it in the current working directory.";
-        const char* romlist_missing_text = missingstr.c_str();
-#else
-        const char* romlist_missing_text = romlist_missing;
-#endif
-
-        FILE* f = Platform::OpenDataFile("romlist.bin");
-        if (f)
-        {
-            u32 data;
-            fread(&data, 4, 1, f);
-            fclose(f);
-
-            if ((data >> 24) == 0) // old CRC-based list
-            {
-                uiMsgBoxError(NULL, "Your version of romlist.bin is outdated.", romlist_missing_text);
-            }
-        }
-        else
-        {
-            uiMsgBoxError(NULL, "romlist.bin not found.", romlist_missing_text);
-        }
-    }
-
-    CreateMainWindowMenu();
-
-    MainDrawAreaHandler.Draw = OnAreaDraw;
-    MainDrawAreaHandler.MouseEvent = OnAreaMouseEvent;
-    MainDrawAreaHandler.MouseCrossed = OnAreaMouseCrossed;
-    MainDrawAreaHandler.DragBroken = OnAreaDragBroken;
-    MainDrawAreaHandler.KeyEvent = OnAreaKeyEvent;
-    MainDrawAreaHandler.Resize = OnAreaResize;
-
-    WindowWidth = Config::WindowWidth;
-    WindowHeight = Config::WindowHeight;
-
-    Screen_UseGL = Config::ScreenUseGL || (Config::_3DRenderer != 0);
-
-    GL_3DScale = Config::GL_ScaleFactor;
-    if (GL_3DScale < 1) GL_3DScale = 1;
-    else if (GL_3DScale > 8) GL_3DScale = 8;
-
-    CreateMainWindow(Screen_UseGL);
-
-    ScreenRotation = Config::ScreenRotation;
-    ScreenGap = Config::ScreenGap;
-    ScreenLayout = Config::ScreenLayout;
-    ScreenSizing = Config::ScreenSizing;
-
-#define SANITIZE(var, min, max)  if ((var < min) || (var > max)) var = 0;
-    SANITIZE(ScreenRotation, 0, 3);
-    SANITIZE(ScreenLayout, 0, 2);
-    SANITIZE(ScreenSizing, 0, 3);
-#undef SANITIZE
-
-    for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]);
-    for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]);
-    uiMenuItemDisable(MenuItem_UndoStateLoad);
-
-    uiMenuItemDisable(MenuItem_Pause);
-    uiMenuItemDisable(MenuItem_Reset);
-    uiMenuItemDisable(MenuItem_Stop);
-
-    uiMenuItemSetChecked(MenuItem_SavestateSRAMReloc, Config::SavestateRelocSRAM?1:0);
-
-    uiMenuItemSetChecked(MenuItem_ScreenRot[ScreenRotation], 1);
-    uiMenuItemSetChecked(MenuItem_ScreenLayout[ScreenLayout], 1);
-    uiMenuItemSetChecked(MenuItem_ScreenSizing[ScreenSizing], 1);
-
-    for (int i = 0; i < 6; i++)
-    {
-        if (ScreenGap == kScreenGap[i])
-            uiMenuItemSetChecked(MenuItem_ScreenGap[i], 1);
-    }
-
-    OnSetScreenRotation(MenuItem_ScreenRot[ScreenRotation], MainWindow, (void*)&kScreenRot[ScreenRotation]);
-
-    uiMenuItemSetChecked(MenuItem_ScreenFilter, Config::ScreenFilter==1);
-    uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1);
-    uiMenuItemSetChecked(MenuItem_AudioSync, Config::AudioSync==1);
-    uiMenuItemSetChecked(MenuItem_ShowOSD, Config::ShowOSD==1);
-
-#ifdef MELONCAP
-    MelonCap::Init();
-#endif // MELONCAP
-
-    AudioSync = SDL_CreateCond();
-    AudioSyncLock = SDL_CreateMutex();
-
-    AudioFreq = 48000; // TODO: make configurable?
-    SDL_AudioSpec whatIwant, whatIget;
-    memset(&whatIwant, 0, sizeof(SDL_AudioSpec));
-    whatIwant.freq = AudioFreq;
-    whatIwant.format = AUDIO_S16LSB;
-    whatIwant.channels = 2;
-    whatIwant.samples = 1024;
-    whatIwant.callback = AudioCallback;
-    AudioDevice = SDL_OpenAudioDevice(NULL, 0, &whatIwant, &whatIget, SDL_AUDIO_ALLOW_FREQUENCY_CHANGE);
-    if (!AudioDevice)
-    {
-        printf("Audio init failed: %s\n", SDL_GetError());
-    }
-    else
-    {
-        AudioFreq = whatIget.freq;
-        printf("Audio output frequency: %d Hz\n", AudioFreq);
-        SDL_PauseAudioDevice(AudioDevice, 1);
-    }
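SDL_AUDIO_ALLOW_FREQUENCY_CHANGE is why AudioFreq is read back from whatIget: the driver may pick a rate other than 48000 Hz, and the emulator then has to match it. Passing 0 for allowed_changes is the stricter alternative, in which case SDL converts internally and the callback always sees the requested spec (sketch):

    // stricter variant: SDL resamples behind the scenes instead of
    // handing the callback a different frequency
    AudioDevice = SDL_OpenAudioDevice(NULL, 0, &whatIwant, &whatIget, 0);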
-
-    memset(&whatIwant, 0, sizeof(SDL_AudioSpec));
-    whatIwant.freq = 44100;
-    whatIwant.format = AUDIO_S16LSB;
-    whatIwant.channels = 1;
-    whatIwant.samples = 1024;
-    whatIwant.callback = MicCallback;
-    MicDevice = SDL_OpenAudioDevice(NULL, 1, &whatIwant, &whatIget, 0);
-    if (!MicDevice)
-    {
-        printf("Mic init failed: %s\n", SDL_GetError());
-        MicBufferLength = 0;
-    }
-    else
-    {
-        SDL_PauseAudioDevice(MicDevice, 1);
-    }
-
-    memset(MicBuffer, 0, sizeof(MicBuffer));
-    MicBufferReadPos = 0;
-    MicBufferWritePos = 0;
-
-    MicWavBuffer = NULL;
-    if (Config::MicInputType == 3) MicLoadWav(Config::MicWavPath);
-
-    JoystickID = Config::JoystickID;
-    Joystick = NULL;
-    OpenJoystick();
-
-    EmuRunning = 2;
-    RunningSomething = false;
-    EmuThread = SDL_CreateThread(EmuThreadFunc, "melonDS magic", NULL);
-
-    if (argc > 1)
-    {
-        char* file = argv[1];
-        char* ext = &file[strlen(file)-3];
-
-        if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl"))
-        {
-            strncpy(ROMPath[0], file, 1023);
-            ROMPath[0][1023] = '\0';
-
-            SetupSRAMPath(0);
-
-            if (NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot))
-                Run();
-        }
-
-        if (argc > 2)
-        {
-            file = argv[2];
-            ext = &file[strlen(file)-3];
-
-            if (!strcasecmp(ext, "gba"))
-            {
-                strncpy(ROMPath[1], file, 1023);
-                ROMPath[1][1023] = '\0';
-
-                SetupSRAMPath(1);
-
-                NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]);
-            }
-        }
-    }
-
-    uiMain();
-
-    if (Joystick) SDL_JoystickClose(Joystick);
-    if (AudioDevice) SDL_CloseAudioDevice(AudioDevice);
-    if (MicDevice) SDL_CloseAudioDevice(MicDevice);
-
-    SDL_DestroyCond(AudioSync);
-    SDL_DestroyMutex(AudioSyncLock);
-
-    if (MicWavBuffer) delete[] MicWavBuffer;
-
-#ifdef MELONCAP
-    MelonCap::DeInit();
-#endif // MELONCAP
-
-    if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]);
-    if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]);
-
-    Config::ScreenRotation = ScreenRotation;
-    Config::ScreenGap = ScreenGap;
-    Config::ScreenLayout = ScreenLayout;
-    Config::ScreenSizing = ScreenSizing;
-
-    Config::Save();
-
-    uiUninit();
-    SDL_Quit();
-    delete[] EmuDirectory;
-    return 0;
-}
-
-#ifdef __WIN32__
-
-#include <windows.h>
-
-int CALLBACK WinMain(HINSTANCE hinst, HINSTANCE hprev, LPSTR cmdline, int cmdshow)
-{
-    int argc = 0;
-    wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc);
-    char* nullarg = "";
-
-    char** argv = new char*[argc];
-    for (int i = 0; i < argc; i++)
-    {
-        int len = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, NULL, 0, NULL, NULL);
-        if (len < 1) return NULL;
-        argv[i] = new char[len];
-        int res = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, argv[i], len, NULL, NULL);
-        if (res != len) { delete[] argv[i]; argv[i] = nullarg; }
-    }
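Two blemishes in the conversion loop above: return NULL from an int-returning function (it compiles only because NULL is 0), and the argv_w array from CommandLineToArgvW is never released. A tightened sketch of the same loop; LocalFree is the documented way to free that array:

    char** argv = new char*[argc];
    for (int i = 0; i < argc; i++)
    {
        int len = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, NULL, 0, NULL, NULL);
        if (len < 1) return 1;  // fail with a real exit code, not NULL
        argv[i] = new char[len];
        int res = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, argv[i], len, NULL, NULL);
        if (res != len) { delete[] argv[i]; argv[i] = nullarg; }
    }
    LocalFree(argv_w);  // CommandLineToArgvW allocates with LocalAlloc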
-
-    if (AttachConsole(ATTACH_PARENT_PROCESS))
-    {
-        freopen("CONOUT$", "w", stdout);
-        freopen("CONOUT$", "w", stderr);
-        printf("\n");
-    }
-
-    int ret = main(argc, argv);
-
-    printf("\n\n>");
-
-    for (int i = 0; i < argc; i++) if (argv[i] != nullarg) delete[] argv[i];
-    delete[] argv;
-
-    return ret;
-}
-
-#endif
-- cgit v1.2.3