field | value | date |
---|---|---|
author | Arisotura <thetotalworm@gmail.com> | 2020-07-01 00:01:11 +0200 |
committer | GitHub <noreply@github.com> | 2020-07-01 00:01:11 +0200 |
commit | 62c6e2f703d88660e0ca9bda78032c5bd6b63a78 (patch) | |
tree | 1dbf9eb1bbe418d14f07dc3a0e30821fb5deb258 | |
parent | d97ce22b010e868437c649911bce89d679a4deaa (diff) | |
parent | c5381d2911d47fb1fcbd6ec27a83f5da3606c4bd (diff) | |
Merge pull request #667 from Arisotura/generic_jit
merge jit
66 files changed, 27783 insertions, 460 deletions
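The bulk of the diff below wires the new recompiler into the CPU cores: each core gains an `ExecuteJIT()` loop that localises the current PC, looks up a compiled block in a per-region fast lookup table (`FastBlockLookup*`), dispatches it via `ARM_Dispatch()`, and falls back to `ARMJIT::CompileBlock()` when no block exists. Alongside this, the cycle bookkeeping in `ARM.cpp`/`ARM.h` is inverted (`Cycles += …` becomes `Cycles -= …`): `Cycles` now counts down from the remaining budget so generated code only has to test the sign of a single counter. The sketch below is a condensed, self-contained model of that loop under those assumptions, not the real implementation; `MiniCpu`, `compile()` and the block map are invented stand-ins, while the countdown arithmetic mirrors `ARMv5::ExecuteJIT()` in the diff.

```cpp
// Condensed model of the ExecuteJIT() dispatch loop added below. MiniCpu and
// compile() are illustrative stand-ins; in melonDS the lookup goes through
// FastBlockLookup / ARMJIT::LookUpBlock, dispatch through ARM_Dispatch(), and
// ARMJIT::CompileBlock() emits real native code.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

using JitBlockEntry = std::function<void(int64_t&)>;

struct MiniCpu
{
    uint32_t pc = 0;
    int64_t timestamp = 0;                                   // NDS::ARM9Timestamp
    std::unordered_map<uint32_t, JitBlockEntry> blocks;      // fast block lookup stand-in

    JitBlockEntry compile(uint32_t addr)
    {
        // Pretend every block is two instructions / four cycles long.
        JitBlockEntry entry = [](int64_t& cyclesLeft) { cyclesLeft -= 4; };
        blocks[addr] = entry;
        return entry;
    }

    void runUntil(int64_t target)                            // NDS::ARM9Target
    {
        while (timestamp < target)
        {
            // Cycles counts *down*: seed it with the remaining budget minus one
            // ("hack so Cycles <= 0 becomes Cycles < 0" in the diff), so emitted
            // code only needs to test the counter's sign to know when to bail out.
            int64_t cycles = target - timestamp - 1;

            auto it = blocks.find(pc);
            JitBlockEntry entry = (it != blocks.end()) ? it->second : compile(pc);
            entry(cycles);                                    // ARM_Dispatch(this, block)

            timestamp = target - cycles - 1;                  // fold the countdown back in
            pc += 8;                                          // advance past the fake block
        }
    }
};

int main()
{
    MiniCpu cpu;
    cpu.runUntil(64);
    std::printf("timestamp=%lld pc=%08X\n", (long long)cpu.timestamp, (unsigned)cpu.pc);
}
```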
@@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/CMakeLists.txt b/CMakeLists.txt index 885f0dd..6729e73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,42 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64 OR ARCHITECTURE STREQUAL ARM64) + option(ENABLE_JIT "Enable x64 JIT recompiler" ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() + +if (CMAKE_BUILD_TYPE STREQUAL Release) + option(ENABLE_LTO "Enable link-time optimization" ON) +else() + option(ENABLE_LTO "Enable link-time optimization" OFF) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Debug) add_compile_options(-Og) endif() diff --git a/src/ARM.cpp b/src/ARM.cpp index 68cac59..8530795 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,8 +21,15 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "Config.h" #include "AREngine.h" +#include "ARMJIT.h" +#include "Config.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif // instruction timing notes // @@ -72,7 +79,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -80,6 +89,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -96,6 +112,12 @@ void ARM::Reset() CodeMem.Mem = NULL; +#ifdef JIT_ENABLED + FastBlockLookup = NULL; + FastBlockLookupStart = 0; + FastBlockLookupSize = 0; +#endif + // zorp JumpTo(ExceptionBase); } @@ -123,7 +145,6 @@ void ARMv5::Reset() GetMemRegion = NDS::ARM9GetMemRegion; } - CP15Reset(); ARM::Reset(); } @@ -158,7 +179,11 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + + // hack to make save states compatible + u32 halted = Halted; + file->Var32(&halted); + Halted = halted; file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -168,6 +193,15 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); +#ifdef JIT_ENABLED + if (!file->Saving && Config::JIT_Enable) + { + // hack, the JIT doesn't really pipeline + // but we still want JIT save states to be + // loaded while running the interpreter + FillPipeline(); + } +#endif file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); @@ -240,15 +274,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= 
CodeCycles; } CPSR |= 0x20; @@ -261,9 +295,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -303,7 +337,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -316,7 +350,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -558,7 +592,7 @@ void ARMv5::Execute() else AddCycles_C(); } - + // TODO optimize this shit!!! if (Halted) { @@ -575,7 +609,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -583,6 +617,75 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); + if (block) + ARM_Dispatch(this, block); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; + + if (StopExecution) + { + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) + { + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + } + } + + if (Halted == 2) + Halted = 0; +} +#endif + void ARMv4::Execute() { if (Halted) @@ -652,10 +755,131 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } +} + +#ifdef JIT_ENABLED +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, 
FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); + if (block) + ARM_Dispatch(this, block); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; + + // TODO optimize this shit!!! + if (StopExecution) + { + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) + { + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + } + } + + if (Halted == 2) + Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } +} +#endif + +void ARMv5::FillPipeline() +{ + SetupCodeMem(R[15]); + + if (CPSR & 0x20) + { + if ((R[15] - 2) & 0x2) + { + NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; + NextInstr[1] = CodeRead32(R[15], false); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[1] = NextInstr[0] >> 16; + } + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4, false); + NextInstr[1] = CodeRead32(R[15], false); + } } + +void ARMv4::FillPipeline() +{ + SetupCodeMem(R[15]); + + if (CPSR & 0x20) + { + NextInstr[0] = CodeRead16(R[15] - 2); + NextInstr[1] = CodeRead16(R[15]); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + } +}
\ No newline at end of file @@ -32,16 +32,21 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); virtual void DoSavestate(Savestate* file); + virtual void FillPipeline() = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; void RestoreCPSR(); @@ -52,6 +57,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT + virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -107,9 +115,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; @@ -131,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart, FastBlockLookupSize; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -146,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -153,12 +174,17 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void PrefetchAbort(); void DataAbort(); void Execute(); +#ifdef JIT_ENABLED + void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -176,14 +202,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -194,9 +220,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -206,9 +232,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -237,10 +263,14 @@ public: u32 DTCMSetting, ITCMSetting; - u8 ITCM[0x8000]; + // for aarch64 JIT they need to go up here + // to be addressable by a 12-bit immediate u32 ITCMSize; - u8 DTCM[0x4000]; u32 DTCMBase, DTCMSize; + s32 RegionCodeCycles; + + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; @@ -265,7 +295,6 @@ public: // code/16N/32N/32S u8 MemTimings[0x100000][4]; - s32 RegionCodeCycles; u8* CurICacheLine; bool (*GetMemRegion)(u32 addr, bool write, NDS::MemRegion* region); @@ -278,9 +307,14 @@ public: void Reset(); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED + void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { @@ -295,8 +329,8 @@ public: void DataRead8(u32 addr, u32* val) { *val = BusRead8(addr); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -304,8 +338,8 @@ public: addr &= ~1; *val = BusRead16(addr); - DataRegion = addr >> 24; - DataCycles = 
NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -313,8 +347,8 @@ public: addr &= ~3; *val = BusRead32(addr); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -322,14 +356,14 @@ public: addr &= ~3; *val = BusRead32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -337,8 +371,8 @@ public: addr &= ~1; BusWrite16(addr, val); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -346,8 +380,8 @@ public: addr &= ~3; BusWrite32(addr, val); - DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataRegion = addr; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -355,20 +389,20 @@ public: addr &= ~3; BusWrite32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; } void AddCycles_CDI() @@ -377,24 +411,24 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else { numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } } else if (CodeRegion == 0x02) { numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD + 1; + Cycles -= numC + numD + 1; } } @@ -404,20 +438,20 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else if (CodeRegion == 0x02) { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD; + Cycles -= numC + numD; } } }; @@ -430,4 +464,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void 
(*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp new file mode 100644 index 0000000..2a61c38 --- /dev/null +++ b/src/ARMJIT.cpp @@ -0,0 +1,1204 @@ +#include "ARMJIT.h" + +#include <string.h> +#include <assert.h> +#include <unordered_map> + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + +#include "Config.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" + +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "DSi.h" +#include "GPU.h" +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" +#include "NDSCart.h" + +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + +namespace ARMJIT +{ + +#define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) + +Compiler* JITCompiler; + +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMMaxSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; +AddressRange CodeIndexBIOS9DSi[0x10000 / 512]; +AddressRange CodeIndexBIOS7DSi[0x10000 / 512]; +AddressRange CodeIndexNWRAM_A[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_B[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_C[DSi::NWRAMSize / 512]; + +std::unordered_map<u32, JitBlock*> JitBlocks9; +std::unordered_map<u32, JitBlock*> JitBlocks7; + +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMMaxSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; +u64 FastBlockLookupBIOS9DSi[0x10000 / 2]; +u64 FastBlockLookupBIOS7DSi[0x10000 / 2]; +u64 FastBlockLookupNWRAM_A[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_B[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_C[DSi::NWRAMSize / 2]; + +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = +{ + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMMaxSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, + 0x10000, + 0x10000, + sizeof(DSi::NWRAM_A), + sizeof(DSi::NWRAM_B), + sizeof(DSi::NWRAM_C), +}; + +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = +{ + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, + CodeIndexBIOS9DSi, + CodeIndexBIOS7DSi, + CodeIndexNWRAM_A, + CodeIndexNWRAM_B, + CodeIndexNWRAM_C +}; + +u64* const 
FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = +{ + NULL, + FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM, + FastBlockLookupBIOS9DSi, + FastBlockLookupBIOS7DSi, + FastBlockLookupNWRAM_A, + FastBlockLookupNWRAM_B, + FastBlockLookupNWRAM_C +}; + +u32 LocaliseCodeAddress(u32 num, u32 addr) +{ + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + if (CodeMemRegions[region]) + return ARMJIT_Memory::LocaliseAddress(region, num, addr); + return 0; +} + +TinyVector<u32> InvalidLiterals; + +template <typename T, int ConsoleType> +T SlowRead9(u32 addr, ARMv5* cpu) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same<T, u32>::value) + val = (ConsoleType == 0 ? NDS::ARM9Read32 : DSi::ARM9Read32)(addr); + else if (std::is_same<T, u16>::value) + val = (ConsoleType == 0 ? NDS::ARM9Read16 : DSi::ARM9Read16)(addr); + else + val = (ConsoleType == 0 ? NDS::ARM9Read8 : DSi::ARM9Read8)(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T, int ConsoleType> +void SlowWrite9(u32 addr, ARMv5* cpu, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same<T, u32>::value) + { + (ConsoleType == 0 ? NDS::ARM9Write32 : DSi::ARM9Write32)(addr, val); + } + else if (std::is_same<T, u16>::value) + { + (ConsoleType == 0 ? NDS::ARM9Write16 : DSi::ARM9Write16)(addr, val); + } + else + { + (ConsoleType == 0 ? NDS::ARM9Write8 : DSi::ARM9Write8)(addr, val); + } +} + +template <typename T, int ConsoleType> +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same<T, u32>::value) + val = (ConsoleType == 0 ? NDS::ARM7Read32 : DSi::ARM7Read32)(addr); + else if (std::is_same<T, u16>::value) + val = (ConsoleType == 0 ? NDS::ARM7Read16 : DSi::ARM7Read16)(addr); + else + val = (ConsoleType == 0 ? NDS::ARM7Read8 : DSi::ARM7Read8)(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T, int ConsoleType> +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same<T, u32>::value) + (ConsoleType == 0 ? NDS::ARM7Write32 : DSi::ARM7Write32)(addr, val); + else if (std::is_same<T, u16>::value) + (ConsoleType == 0 ? NDS::ARM7Write16 : DSi::ARM7Write16)(addr, val); + else + (ConsoleType == 0 ? 
NDS::ARM7Write8 : DSi::ARM7Write8)(addr, val); +} + +template <bool Write, int ConsoleType> +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + if (Write) + SlowWrite9<u32, ConsoleType>(addr, cpu, data[i]); + else + data[i] = SlowRead9<u32, ConsoleType>(addr, cpu); + addr += 4; + } +} + +template <bool Write, int ConsoleType> +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + if (Write) + SlowWrite7<u32, ConsoleType>(addr, data[i]); + else + data[i] = SlowRead7<u32, ConsoleType>(addr); + addr += 4; + } +} + +#define INSTANTIATE_SLOWMEM(consoleType) \ + template void SlowWrite9<u32, consoleType>(u32, ARMv5*, u32); \ + template void SlowWrite9<u16, consoleType>(u32, ARMv5*, u16); \ + template void SlowWrite9<u8, consoleType>(u32, ARMv5*, u8); \ + \ + template u32 SlowRead9<u32, consoleType>(u32, ARMv5*); \ + template u16 SlowRead9<u16, consoleType>(u32, ARMv5*); \ + template u8 SlowRead9<u8, consoleType>(u32, ARMv5*); \ + \ + template void SlowWrite7<u32, consoleType>(u32, u32); \ + template void SlowWrite7<u16, consoleType>(u32, u16); \ + template void SlowWrite7<u8, consoleType>(u32, u8); \ + \ + template u32 SlowRead7<u32, consoleType>(u32); \ + template u16 SlowRead7<u16, consoleType>(u32); \ + template u8 SlowRead7<u8, consoleType>(u32); \ + \ + template void SlowBlockTransfer9<false, consoleType>(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer9<true, consoleType>(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer7<false, consoleType>(u32 addr, u64* data, u32 num); \ + template void SlowBlockTransfer7<true, consoleType>(u32 addr, u64* data, u32 num); \ + +INSTANTIATE_SLOWMEM(0) +INSTANTIATE_SLOWMEM(1) + +template <typename K, typename V, int Size, V InvalidValue> +struct UnreliableHashTable +{ + struct Bucket + { + K KeyA, KeyB; + V ValA, ValB; + }; + + Bucket Table[Size]; + + void Reset() + { + for (int i = 0; i < Size; i++) + { + Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable<u32, JitBlock*, 0x800, nullptr> RestoreCandidates; + 
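A note on the `RestoreCandidates` table just declared: `UnreliableHashTable` is a fixed-size cache with two entries per bucket, keyed here by a hash of a block's instruction words; inserting into a full bucket simply evicts an existing candidate, which is why it is "unreliable". It lets `CompileBlock()` further down in this file recover the generated code of a block whose memory was remapped or invalidated without its instructions actually changing, instead of recompiling it. The snippet below is only an illustrative sketch of that contract, assuming the template and `u32` typedef available above; `JitBlockStub` and both helper functions are made up for the example and are not part of the diff.

```cpp
// Illustrative use of the UnreliableHashTable contract declared above. The real
// logic lives in CompileBlock() and InvalidateByAddr() later in ARMJIT.cpp.
struct JitBlockStub { u32 StartAddr; u32 InstrHash; };
using CandidateTable = UnreliableHashTable<u32, JitBlockStub*, 0x800, nullptr>;

// Invalidation path: park the block keyed by its instruction hash instead of
// deleting it. Insert() may evict an existing candidate from a full bucket and
// returns it, so ownership of the evicted block comes back to the caller.
void ParkBlock(CandidateTable& candidates, JitBlockStub* block)
{
    JitBlockStub* evicted = candidates.Insert(block->InstrHash, block);
    if (evicted)
        delete evicted;
}

// Compile path: if a block with an identical instruction stream was parked for
// this address, reuse it; otherwise the caller compiles from scratch.
JitBlockStub* TryRestore(CandidateTable& candidates, u32 blockAddr, u32 instrHash)
{
    JitBlockStub* prev = candidates.LookUp(instrHash);
    if (!prev)
        return nullptr;

    candidates.Remove(instrHash);
    if (prev->StartAddr != blockAddr)
    {
        delete prev;    // hash matched but the block belongs elsewhere; discard it
        return nullptr;
    }
    return prev;
}
```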
+void Init() +{ + JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); +} + +void DeInit() +{ + ARMJIT_Memory::DeInit(); + + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + ARMJIT_Memory::Reset(); +} + +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +{ + for (int j = start; j >= 0; j--) + { + u8 match = instrs[j].Info.WriteFlags & flags; + u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags; + if (matchMaybe) // writes flags maybe + instrs[j].SetFlags |= matchMaybe; + if (match) + { + instrs[j].SetFlags |= match; + flags &= ~match; + if (!flags) + return; + } + } +} + +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) +{ + if (!thumb) + { + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); + return true; + } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; +} + +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } + } + else + { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + JIT_DEBUGPRINT("checking potential idle loop\n"); + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == 
ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +void NOP(ARM* cpu) {} + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC), + NOP +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +void T_BL_LONG(ARM* cpu) +{ + ARMInterpreter::T_BL_LONG_1(cpu); + cpu->R[15] += 2; + ARMInterpreter::T_BL_LONG_2(cpu); +} + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + T_BL_LONG // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) +{ + bool thumb = cpu->CPSR & 0x20; + + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + u32 blockAddr = cpu->R[15] - (thumb ? 
2 : 4); + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 otherLocalAddr = existingBlockIt->second->StartAddrLocal; + + if (localAddr == otherLocalAddr) + { + JIT_DEBUGPRINT("switching out block %x %x %x\n", localAddr, blockAddr, existingBlockIt->second->StartAddr); + + u64* entry = &FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + // some memory has been remapped + JitBlock* prevBlock = RestoreCandidates.Insert(existingBlockIt->second->InstrHash, existingBlockIt->second); + if (prevBlock) + delete prevBlock; + + map.erase(existingBlockIt); + } + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; + int i = 0; + u32 r15 = cpu->R[15]; + + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; + u32 numAddressRanges = 0; + + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + + cpu->FillPipeline(); + u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); + + u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; + + do + { + r15 += thumb ? 2 : 4; + + instrs[i].BranchFlags = 0; + instrs[i].SetFlags = 0; + instrs[i].Instr = nextInstr[0]; + nextInstr[0] = nextInstr[1]; + + instrs[i].Addr = nextInstrAddr[0]; + nextInstrAddr[0] = nextInstrAddr[1]; + nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 
0xFFFF : ~0), instrs[i].Addr); + + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr >> 27); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addressRanges[j] == translatedAddrRounded) + { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); + returning = true; + break; + } + } + if (!returning) + addressRanges[numAddressRanges++] = translatedAddrRounded; + } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); + + if (cpu->Num == 0) + { + ARMv5* cpuv5 = (ARMv5*)cpu; + if (thumb && r15 & 0x2) + { + nextInstr[1] >>= 16; + instrs[i].CodeCycles = 0; + } + else + { + nextInstr[1] = cpuv5->CodeRead32(r15, false); + instrs[i].CodeCycles = cpu->CodeCycles; + } + } + else + { + ARMv4* cpuv4 = (ARMv4*)cpu; + if (thumb) + nextInstr[1] = cpuv4->CodeRead16(r15); + else + nextInstr[1] = cpuv4->CodeRead32(r15); + instrs[i].CodeCycles = cpu->CodeCycles; + } + instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (instrs[i].Info.DstRegs & (1 << 14) + || (!thumb + && (instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_IMM || instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_REG) + && instrs[i].Instr & (1 << 16))) + hasLink = false; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] + || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) + { + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; + } + + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 + && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) + { + instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG; + instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16); + instrs[i - 1].Info.DstRegs = 0xC000; + instrs[i - 
1].Info.SrcRegs = 0; + instrs[i - 1].Info.EndBlock = true; + i--; + } + + if (instrs[i].Info.Branches() && Config::JIT_BranchOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + instrs[i].BranchFlags |= branch_StaticTarget; + + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + if (link) + { + lr = linkAddr; + hasLink = true; + } + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; + + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + + JitBlock* prevBlock = RestoreCandidates.LookUp(instrHash); + bool mayRestore = true; + if (prevBlock) + { + RestoreCandidates.Remove(instrHash); + + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } + + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; + for (int j = 0; j < numAddressRanges; j++) + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; + + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + + JIT_DEBUGPRINT("block start %p\n", block->EntryPoint); + } + else + { + JIT_DEBUGPRINT("restored! %p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + + AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); + } + + if (cpu->Num == 0) + JitBlocks9[blockAddr] = block; + else + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 27)][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); +} + +void InvalidateByAddr(u32 localAddr) +{ + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); + + AddressRange* region = CodeMemRegions[localAddr >> 27]; + AddressRange* range = ®ion[(localAddr & 0x7FFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) + { + JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); + } + + bool literalInvalidation = 
false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == localAddr) + { + if (InvalidLiterals.Find(localAddr) != -1) + { + InvalidLiterals.Add(localAddr); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 512) != (localAddr / 512)) + { + AddressRange* otherRegion = CodeMemRegions[addr >> 27]; + AddressRange* otherRange = &otherRegion[(addr & 0x7FFFFFF) / 512]; + assert(otherRange != range); + + bool removed = otherRange->Blocks.RemoveByValue(block); + assert(removed); + + if (otherRange->Blocks.Length == 0) + { + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); + + otherRange->Code = 0; + } + } + } + + FastBlockLookupRegions[block->StartAddrLocal >> 27][(block->StartAddrLocal & 0x7FFFFFF) / 2] = (u64)UINT32_MAX << 32; + if (block->Num == 0) + JitBlocks9.erase(block->StartAddr); + else + JitBlocks7.erase(block->StartAddr); + + if (!literalInvalidation) + { + JitBlock* prevBlock = RestoreCandidates.Insert(block->InstrHash, block); + if (prevBlock) + delete prevBlock; + } + else + { + delete block; + } + } +} + +void CheckAndInvalidateITCM() +{ + for (u32 i = 0; i < ITCMPhysicalSize; i+=16) + { + if (CodeIndexITCM[i / 512].Code & (1 << ((i & 0x1FF) / 16))) + { + InvalidateByAddr(i | (ARMJIT_Memory::memregion_ITCM << 27)); + } + } +} + +template <u32 num, int region> +void CheckAndInvalidate(u32 addr) +{ + u32 localAddr = ARMJIT_Memory::LocaliseAddress(region, num, addr); + if (CodeMemRegions[region][(localAddr & 0x7FFFFFF) / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr); +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; +} + +void blockSanityCheck(u32 num, u32 blockAddr, JitBlockEntry entry) +{ + u32 localAddr = LocaliseCodeAddress(num, blockAddr); + assert(JITCompiler->AddEntryOffset((u32)FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]) == entry); +} + +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) +{ + // amazingly ignoring the DTCM is the proper behaviour for code fetches + int region = num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 memoryOffset; + if (FastBlockLookupRegions[region] + && ARMJIT_Memory::GetMirrorLocation(region, num, blockAddr, memoryOffset, start, size)) + { + //printf("setup exec region %d %d %08x %08x %x %x\n", num, region, blockAddr, start, size, memoryOffset); + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + return true; + } + return false; +} + +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + InvalidLiterals.Clear(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); + RestoreCandidates.Reset(); + for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) + { + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValA; + RestoreCandidates.Table[i].ValA = NULL; + } + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValB; + RestoreCandidates.Table[i].ValB = NULL; + } + } + for (auto it : JitBlocks9) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; + } + delete block; + } + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); + + JITCompiler->Reset(); +} + +} diff --git a/src/ARMJIT.h b/src/ARMJIT.h new file mode 100644 index 0000000..04add59 --- /dev/null +++ b/src/ARMJIT.h @@ -0,0 +1,37 @@ +#ifndef ARMJIT_H +#define ARMJIT_H + +#include "types.h" + +#include "ARM.h" +#include "ARM_InstrInfo.h" + +namespace ARMJIT +{ + +typedef void (*JitBlockEntry)(); + +void Init(); +void DeInit(); + +void Reset(); + +void CheckAndInvalidateITCM(); + +void InvalidateByAddr(u32 pseudoPhysical); + +template <u32 num, int region> +void CheckAndInvalidate(u32 addr); + +void CompileBlock(ARM* cpu); + +void ResetBlockCache(); + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 
blockAddr, u64*& entry, u32& start, u32& size); + +} + +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + +#endif
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..5f021a0 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -0,0 +1,930 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_RegShiftReg(int op, bool S, Op2& op2, ARM64Reg rs) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + UBFX(W1, rs, 0, 8); + + if (!S) + { + if (op == 3) + RORV(W0, op2.Reg.Rm, W1); + else + { + CMP(W1, 32); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GE); + ASRV(W0, op2.Reg.Rm, W1); + } + else + { + if (op == 0) + LSLV(W0, op2.Reg.Rm, W1); + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W0, WZR, W0, CC_GE); + } + } + } + else + { + MOV(W0, op2.Reg.Rm); + FixupBranch zero = CBZ(W1); + + SUB(W1, W1, 1); + if (op == 3) + { + RORV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + CMP(W1, 31); + if (op == 2) + { + MOVI2R(W2, 31); + CSEL(W1, W2, W1, CC_GT); + ASRV(W0, op2.Reg.Rm, W1); + BFI(RCPSR, W0, 29, 1); + } + else + { + if (op == 0) + { + LSLV(W0, op2.Reg.Rm, W1); + UBFX(W1, W0, 31, 1); + } + else if (op == 1) + LSRV(W0, op2.Reg.Rm, W1); + CSEL(W1, WZR, op ? W0 : W1, CC_GT); + BFI(RCPSR, W1, 29, 1); + CSEL(W0, WZR, W0, CC_GE); + } + } + + MOV(W0, W0, ArithOption(W0, (ShiftType)op, 1)); + SetJumpTarget(zero); + } + op2 = Op2(W0, ST_LSL, 0); +} + +void Compiler::Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, ARM64Reg tmp) +{ + if (!(CurInstr.SetFlags & 0x2)) + S = false; + + CPSRDirty |= S; + + switch (op) + { + case 0: // LSL + if (S && amount) + { + UBFX(tmp, op2.Reg.Rm, 32 - amount, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_LSL, amount); + return; + case 1: // LSR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + if (amount == 0) + { + op2 = Op2(0); + return; + } + op2 = Op2(op2.Reg.Rm, ST_LSR, amount); + return; + case 2: // ASR + if (S) + { + UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ASR, amount ? 
amount : 31); + return; + case 3: // ROR + if (amount == 0) + { + UBFX(tmp, RCPSR, 29, 1); + LSL(tmp, tmp, 31); + if (S) + BFI(RCPSR, op2.Reg.Rm, 29, 1); + ORR(tmp, tmp, op2.Reg.Rm, ArithOption(tmp, ST_LSR, 1)); + + op2 = Op2(tmp, ST_LSL, 0); + } + else + { + if (S) + { + UBFX(tmp, op2.Reg.Rm, amount - 1, 1); + BFI(RCPSR, tmp, 29, 1); + } + op2 = Op2(op2.Reg.Rm, ST_ROR, amount); + } + return; + } +} + +void Compiler::Comp_RetriveFlags(bool retriveCV) +{ + if (CurInstr.SetFlags) + CPSRDirty = true; + + if (CurInstr.SetFlags & 0x4) + { + CSET(W0, CC_EQ); + BFI(RCPSR, W0, 30, 1); + } + if (CurInstr.SetFlags & 0x8) + { + CSET(W0, CC_MI); + BFI(RCPSR, W0, 31, 1); + } + if (retriveCV) + { + if (CurInstr.SetFlags & 0x2) + { + CSET(W0, CC_CS); + BFI(RCPSR, W0, 29, 1); + } + if (CurInstr.SetFlags & 0x1) + { + CSET(W0, CC_VS); + BFI(RCPSR, W0, 28, 1); + } + } +} + +void Compiler::Comp_Logical(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (S && !CurInstr.SetFlags) + S = false; + + switch (op) + { + case 0x0: // AND + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, op2.Imm, W0); + else + ANDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, op2.Imm, W0); + else + AND(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x1: // EOR + if (op2.IsImm) + EORI2R(rd, rn, op2.Imm, W0); + else + EOR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xC: // ORR + if (op2.IsImm) + ORRI2R(rd, rn, op2.Imm, W0); + else + ORR(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + if (S && FlagsNZNeeded()) + TST(rd, rd); + break; + case 0xE: // BIC + if (S) + { + if (op2.IsImm) + ANDSI2R(rd, rn, ~op2.Imm, W0); + else + BICS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ANDI2R(rd, rn, ~op2.Imm, W0); + else + BIC(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + if (S && !CurInstr.SetFlags) + S = false; + + bool CVInGPR = false; + switch (op) + { + case 0x2: // SUB + if (S) + { + if (op2.IsImm) + SUBSI2R(rd, rn, op2.Imm, W0); + else + SUBS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + { + MOVI2R(W2, op2.Imm); + SUBI2R(rd, rn, op2.Imm, W0); + } + else + SUB(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x3: // RSB + if (op2.IsZero()) + { + op2 = Op2(WZR); + } + else if (op2.IsImm) + { + MOVI2R(W1, op2.Imm); + op2 = Op2(W1); + } + else if (op2.Reg.ShiftAmount != 0) + { + MOV(W1, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W1); + } + + if (S) + SUBS(rd, op2.Reg.Rm, rn); + else + SUB(rd, op2.Reg.Rm, rn); + break; + case 0x4: // ADD + if (S) + { + if (op2.IsImm) + ADDSI2R(rd, rn, op2.Imm, W0); + else + ADDS(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + else + { + if (op2.IsImm) + ADDI2R(rd, rn, op2.Imm, W0); + else + ADD(rd, rn, op2.Reg.Rm, op2.ToArithOption()); + } + break; + case 0x5: // ADC + UBFX(W2, RCPSR, 29, 1); + if (S) + { + CVInGPR = true; + ADDS(W1, rn, W2); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm, W0); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, rn, W2); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm, W0); + else + ADD(rd, W1, op2.Reg.Rm, 
op2.ToArithOption()); + } + break; + case 0x6: // SBC + UBFX(W2, RCPSR, 29, 1); + // W1 = -op2 - 1 + if (op2.IsImm) + MOVI2R(W1, ~op2.Imm); + else + ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); + if (S) + { + CVInGPR = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + ADDS(rd, rn, W1); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + ADD(rd, rn, W1); + } + break; + case 0x7: // RSC + UBFX(W2, RCPSR, 29, 1); + // W1 = -rn - 1 + MVN(W1, rn); + if (S) + { + CVInGPR = true; + ADDS(W1, W2, W1); + CSET(W2, CC_CS); + CSET(W3, CC_VS); + if (op2.IsImm) + ADDSI2R(rd, W1, op2.Imm); + else + ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + CSINC(W2, W2, WZR, CC_CC); + CSINC(W3, W3, WZR, CC_VC); + } + else + { + ADD(W1, W2, W1); + if (op2.IsImm) + ADDI2R(rd, W1, op2.Imm); + else + ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption()); + } + break; + } + + if (S) + { + if (CVInGPR) + { + BFI(RCPSR, W2, 29, 1); + BFI(RCPSR, W3, 28, 1); + } + Comp_RetriveFlags(!CVInGPR); + } +} + +void Compiler::Comp_Compare(int op, ARM64Reg rn, Op2 op2) +{ + if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR) + { + MOV(W0, op2.Reg.Rm, op2.ToArithOption()); + op2 = Op2(W0, ST_LSL, 0); + } + + switch (op) + { + case 0x8: // TST + if (op2.IsImm) + TSTI2R(rn, op2.Imm, W0); + else + ANDS(WZR, rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0x9: // TEQ + if (op2.IsImm) + EORI2R(W0, rn, op2.Imm, W0); + else + EOR(W0, rn, op2.Reg.Rm, op2.ToArithOption()); + TST(W0, W0); + break; + case 0xA: // CMP + if (op2.IsImm) + CMPI2R(rn, op2.Imm, W0); + else + CMP(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + case 0xB: // CMN + if (op2.IsImm) + ADDSI2R(WZR, rn, op2.Imm, W0); + else + CMN(rn, op2.Reg.Rm, op2.ToArithOption()); + break; + } + + Comp_RetriveFlags(op >= 0xA); +} + +// also counts cycles! 
+void Compiler::A_Comp_GetOp2(bool S, Op2& op2) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + op2 = Op2(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + op2.Reg.Rm = MapReg(CurInstr.A_Reg(0)); + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + if (CurInstr.A_Reg(0) == 15) + { + ADD(W0, op2.Reg.Rm, 4); + op2.Reg.Rm = W0; + } + Comp_RegShiftReg(op, S, op2, rs); + } + else + { + Comp_AddCycles_C(); + + int amount = (CurInstr.Instr >> 7) & 0x1F; + Comp_RegShiftImm(op, amount, S, op2); + } + } +} + +void Compiler::A_Comp_ALUCmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(op <= 0x9, op2); + + Comp_Compare(op, rn, op2); +} + +void Compiler::A_Comp_ALUMovOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + Op2 op2; + A_Comp_GetOp2(S, op2); + + if (op == 0xF) // MVN + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm); + MOVI2R(rd, ~op2.Imm); + } + else + ORN(rd, WZR, op2.Reg.Rm, op2.ToArithOption()); + } + else // MOV + { + if (op2.IsImm) + { + if (CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm); + MOVI2R(rd, op2.Imm); + } + else + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } + } + + if (S) + { + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_ALUTriOp() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + bool logical = (1 << op) & 0xF303; + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + Op2 op2; + A_Comp_GetOp2(S && logical, op2); + + if (op2.IsImm && op2.Imm == 0) + op2 = Op2(WZR, ST_LSL, 0); + + if (logical) + Comp_Logical(op, S, rd, rn, op2); + else + Comp_Arithmetic(op, S, rd, rn, op2); + + if (CurInstr.Info.Branches()) + Comp_JumpTo(rd, true, S); +} + +void Compiler::A_Comp_Clz() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + + CLZ(rd, rm); + + assert(Num == 0); +} + +void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg rs, ARM64Reg rn) +{ + if (Num == 0) + { + Comp_AddCycles_CI(S ? 3 : 1); + } + else + { + CLS(W0, rs); + Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (mla) + MADD(rd, rm, rs, rn); + else + MUL(rd, rm, rs); + + if (S && FlagsNZNeeded()) + { + TST(rd, rd); + Comp_RetriveFlags(false); + } +} + +void Compiler::A_Comp_Mul_Long() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + + if (Num == 0) + { + Comp_AddCycles_CI(S ? 
3 : 1); + } + else + { + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); + Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); + } + + if (add) + { + MOV(W0, rn); + BFI(X0, EncodeRegTo64(rd), 32, 32); + if (sign) + SMADDL(EncodeRegTo64(rn), rm, rs, X0); + else + UMADDL(EncodeRegTo64(rn), rm, rs, X0); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + else + { + if (sign) + SMULL(EncodeRegTo64(rn), rm, rs); + else + UMULL(EncodeRegTo64(rn), rm, rs); + if (S && FlagsNZNeeded()) + TST(EncodeRegTo64(rn), EncodeRegTo64(rn)); + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + } + + if (S) + Comp_RetriveFlags(false); +} + +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + +void Compiler::A_Comp_Mul() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + + bool S = CurInstr.Instr & (1 << 20); + bool mla = CurInstr.Instr & (1 << 21); + ARM64Reg rn = INVALID_REG; + if (mla) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_Mul_Mla(S, mla, rd, rm, rs, rn); +} + +void Compiler::T_Comp_ShiftImm() +{ + Comp_AddCycles_C(); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + Op2 op2; + op2.Reg.Rm = MapReg(CurInstr.T_Reg(3)); + Comp_RegShiftImm(op, amount, true, op2); + if (op2.IsImm) + MOVI2R(rd, op2.Imm); + else + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + + Comp_RetriveFlags(false); +} + +void Compiler::T_Comp_AddSub_() +{ + Comp_AddCycles_C(); + + Op2 op2; + if (CurInstr.Instr & (1 << 10)) + op2 = Op2((CurInstr.Instr >> 6) & 0x7); + else + op2 = Op2(MapReg(CurInstr.T_Reg(6))); + + Comp_Arithmetic( + CurInstr.Instr & (1 << 9) ? 
0x2 : 0x4, + true, + MapReg(CurInstr.T_Reg(0)), + MapReg(CurInstr.T_Reg(3)), + op2); +} + +void Compiler::T_Comp_ALUImm8() +{ + Comp_AddCycles_C(); + + u32 imm = CurInstr.Instr & 0xFF; + int op = (CurInstr.Instr >> 11) & 0x3; + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + + switch (op) + { + case 0: + MOVI2R(rd, imm); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + case 1: + Comp_Compare(0xA, rd, Op2(imm)); + break; + case 2: + case 3: + Comp_Arithmetic(op == 2 ? 0x4 : 0x2, true, rd, rd, Op2(imm)); + break; + } +} + +void Compiler::T_Comp_ALU() +{ + int op = (CurInstr.Instr >> 6) & 0xF; + ARM64Reg rd = MapReg(CurInstr.T_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.T_Reg(3)); + + if ((op >= 0x2 && op <= 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + Comp_Logical(0x0, true, rd, rd, Op2(rs)); + break; + case 0x1: + Comp_Logical(0x1, true, rd, rd, Op2(rs)); + break; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + Op2 op2; + op2.Reg.Rm = rd; + Comp_RegShiftReg(op == 0x7 ? 3 : (op - 0x2), true, op2, rs); + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + } + break; + case 0x5: + Comp_Arithmetic(0x5, true, rd, rd, Op2(rs)); + break; + case 0x6: + Comp_Arithmetic(0x6, true, rd, rd, Op2(rs)); + break; + case 0x8: + Comp_Compare(0x8, rd, Op2(rs)); + break; + case 0x9: + Comp_Arithmetic(0x3, true, rd, rs, Op2(0)); + break; + case 0xA: + Comp_Compare(0xA, rd, Op2(rs)); + break; + case 0xB: + Comp_Compare(0xB, rd, Op2(rs)); + break; + case 0xC: + Comp_Logical(0xC, true, rd, rd, Op2(rs)); + break; + case 0xD: + Comp_Mul_Mla(true, false, rd, rd, rs, INVALID_REG); + break; + case 0xE: + Comp_Logical(0xE, true, rd, rd, Op2(rs)); + break; + case 0xF: + MVN(rd, rs); + if (FlagsNZNeeded()) + TST(rd, rd); + Comp_RetriveFlags(false); + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0: + Comp_Arithmetic(0x4, false, rdMapped, rdMapped, Op2(rs)); + break; + case 1: + Comp_Compare(0xA, rdMapped, rs); + return; + case 2: + MOV(rdMapped, rs); + break; + } + + if (rd == 15) + { + Comp_JumpTo(rdMapped, false, false); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + ARM64Reg sp = MapReg(13); + u32 offset = (CurInstr.Instr & 0x7F) << 2; + if (CurInstr.Instr & (1 << 7)) + SUB(sp, sp, offset); + else + ADD(sp, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + ARM64Reg sp = MapReg(13); + ADD(rd, sp, offset); + } + else + MOVI2R(rd, (R15 & ~2) + offset); +} + +}
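
The flag handling in this translation unit leans on CSET/BFI pairs to copy each host condition flag into bits 31..28 of the emulated CPSR (see Comp_RetriveFlags above). As a quick reference, here is a minimal standalone sketch of that bit layout; it is not part of the diff, and the function and variable names are illustrative only.

    #include <cstdint>
    #include <cstdio>

    // Pack N/Z/C/V into the top nibble of a CPSR word, matching the
    // BFI(RCPSR, ..., 31/30/29/28, 1) targets used by the compiler above.
    static uint32_t packNZCV(uint32_t cpsr, bool n, bool z, bool c, bool v)
    {
        cpsr &= ~0xF0000000u;          // clear bits 31..28 (N, Z, C, V)
        cpsr |= (uint32_t)n << 31;     // N - negative
        cpsr |= (uint32_t)z << 30;     // Z - zero
        cpsr |= (uint32_t)c << 29;     // C - carry
        cpsr |= (uint32_t)v << 28;     // V - overflow
        return cpsr;
    }

    int main()
    {
        // e.g. a SUBS result that was negative with a borrow: N=1, Z=0, C=0, V=0
        printf("%08X\n", (unsigned)packNZCV(0x000000DF, true, false, false, false)); // 800000DF
        return 0;
    }
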
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..f130938 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -0,0 +1,421 @@ +#include "ARMJIT_Compiler.h" + +using namespace Arm64Gen; + +// hack +const int kCodeCacheTiming = 3; + +namespace ARMJIT +{ + +template <typename T> +void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR) +{ + cpu->JumpTo(addr, changeCPSR); +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + ORRI2R(RCPSR, RCPSR, 0x20); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + ANDI2R(RCPSR, RCPSR, ~0x20); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOVI2R(W0, regionCodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true) >> 16; + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOVI2R(W0, codeRegion); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeRegion)); + MOVI2R(W0, codeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + { + MOVI2R(W0, newPC); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + } + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + + +void* Compiler::Gen_JumpTo9(int kind) +{ + AlignCode16(); + void* res = GetRXPtr(); + + LSR(W1, W0, 12); + ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); + LDRB(W1, RCPU, W1); + + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, 
ITCMSize)); + + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); + + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + // ARM + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); + RET(); + } + + if (kind == 0 || kind == 2) + { + // Thumb + if (kind == 0) + { + SetJumpTarget(switchToThumb); + ORRI2R(RCPSR, RCPSR, 0x20); + } + + ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); + RET(); + } + + return res; +} + +void* Compiler::Gen_JumpTo7(int kind) +{ + void* res = GetRXPtr(); + + LSR(W1, W0, 24); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeRegion)); + LSR(W1, W0, 15); + STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeCycles)); + + MOVP2R(X2, NDS::ARM7MemTimings); + LDR(W3, X2, ArithOption(W1, true)); + + FixupBranch switchToThumb; + if (kind == 0) + switchToThumb = TBNZ(W0, 0); + + if (kind == 0 || kind == 1) + { + UBFX(W2, W3, 0, 8); + UBFX(W3, W3, 8, 8); + ADD(W2, W3, W2); + SUB(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~3); + + if (kind == 0) + ANDI2R(RCPSR, RCPSR, ~0x20); + + ADD(W3, W0, 4); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + if (kind == 0 || kind == 2) + { + if (kind == 0) + { + SetJumpTarget(switchToThumb); + + ORRI2R(RCPSR, RCPSR, 0x20); + } + + UBFX(W2, W3, 16, 8); + UBFX(W3, W3, 24, 8); + ADD(W2, W3, W2); + SUB(RCycles, RCycles, W2); + + ANDI2R(W0, W0, ~1); + + ADD(W3, W0, 2); + STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); + + RET(); + } + + return res; +} + +void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR) +{ + IrregularCycles = true; + + if (!restoreCPSR) + { + if (switchThumb) + CPSRDirty = true; + MOV(W0, addr); + BL((Num ? JumpToFuncs7 : JumpToFuncs9)[switchThumb ? 
0 : (Thumb + 1)]); + } + else + { + + bool cpsrDirty = CPSRDirty; + SaveCPSR(); + SaveCycles(); + PushRegs(restoreCPSR); + + if (switchThumb) + MOV(W1, addr); + else + { + if (Thumb) + ORRI2R(W1, addr, 1); + else + ANDI2R(W1, addr, ~1); + } + MOV(X0, RCPU); + MOVI2R(W2, restoreCPSR); + if (Num == 0) + QuickCallFunction(X3, jumpToTrampoline<ARMv5>); + else + QuickCallFunction(X3, jumpToTrampoline<ARMv4>); + + PopRegs(restoreCPSR); + LoadCycles(); + LoadCPSR(); + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; + } +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOVI2R(MapReg(14), R15 - 4); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + ARM64Reg rn = MapReg(CurInstr.A_Reg(0)); + MOV(W0, rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOVI2R(MapReg(14), R15 - 4); + Comp_JumpTo(W0, true); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_BranchSpecialBehaviour(true); + + FixupBranch skipFailed = B(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + + Comp_BranchSpecialBehaviour(false); + + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(W0, MapReg(CurInstr.A_Reg(3))); + MOVI2R(MapReg(14), R15 - 1); + Comp_JumpTo(W0, true); + } + else + { + ARM64Reg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn, true); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOVI2R(MapReg(14), R15 + offset); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + ARM64Reg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + ADD(W0, lr, offset); + MOVI2R(lr, (R15 - 2) | 1); + Comp_JumpTo(W0, Num == 0 && !(CurInstr.Instr & (1 << 12))); +} + +void Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOVI2R(MapReg(14), (R15 - 2) | 1); + + Comp_JumpTo(target); +} + +}
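
The Thumb branch handlers above recover signed byte offsets from the 11-bit immediate with a shift-left/arithmetic-shift-right pair (e.g. `(s32)((Instr & 0x7FF) << 21) >> 20` in T_Comp_B). Below is a small standalone sketch of that trick; the names are illustrative, not the compiler's helpers.

    #include <cstdint>
    #include <cstdio>

    // Sign-extend the 11-bit Thumb branch immediate and convert it to a byte
    // offset: shifting left by 21 puts the sign bit (bit 10) at bit 31, and the
    // arithmetic shift right by 20 brings it back down one bit short, i.e. x2
    // (halfword count -> byte offset).
    static int32_t thumbBranchOffset(uint32_t instr)
    {
        return (int32_t)((instr & 0x7FF) << 21) >> 20;
    }

    int main()
    {
        printf("%d\n", thumbBranchOffset(0xE7FE)); // -4: offset of the common branch-to-self encoding
        printf("%d\n", thumbBranchOffset(0xE001)); // +2: one halfword forward
        return 0;
    }
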
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..42435ed --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -0,0 +1,884 @@ +#ifdef __SWITCH__ +#include "../switch/compat_switch.h" + +extern char __start__; +#else +#include <sys/mman.h> +#include <unistd.h> +#endif + +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + +#include <malloc.h> + +using namespace Arm64Gen; + +extern "C" void ARM_Ret(); + +namespace ARMJIT +{ + +/* + + Recompiling classic ARM to ARMv8 code is at the same time + easier and trickier than compiling to a less related architecture + like x64. At one hand you can translate a lot of instructions directly. + But at the same time, there are a ton of exceptions, like for + example ADD and SUB can't have a RORed second operand on ARMv8. + + While writing a JIT when an instruction is recompiled into multiple ones + not to write back until you've read all the other operands! +*/ + +template <> +const ARM64Reg RegisterCache<Compiler, ARM64Reg>::NativeRegAllocOrder[] = + {W19, W20, W21, W22, W23, W24, W25, W26}; +template <> +const int RegisterCache<Compiler, ARM64Reg>::NativeRegsAvailable = 8; + +const int JitMemSize = 16 * 1024 * 1024; +#ifndef __SWITCH__ +u8 JitMem[JitMemSize]; +#endif + +void Compiler::MovePC() +{ + ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); +} + +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); + + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ 
+ if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + +Compiler::Compiler() +{ +#ifdef __SWITCH__ + JitRWBase = memalign(0x1000, JitMemSize); + + JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000; + JitRWStart = virtmemReserve(JitMemSize); + MemoryInfo info = {0}; + u32 pageInfo = {0}; + int i = 0; + while (JitRXStart != NULL) + { + svcQueryMemory(&info, &pageInfo, (u64)JitRXStart); + if (info.type != MemType_Unmapped) + JitRXStart = (void*)((u8*)info.addr - JitMemSize - 0x1000); + else + break; + if (i++ > 8) + { + printf("couldn't find unmapped place for jit memory\n"); + JitRXStart = NULL; + } + } + + assert(JitRXStart != NULL); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize, Perm_Rx)); + assert(succeded); + succeded = R_SUCCEEDED(svcMapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + + SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); + JitMemMainSize = JitMemSize; +#else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned; + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + + SetCodeBase(pageAligned, pageAligned); + JitMemUseableSize = alignedSize; +#endif + SetCodePtr(0); + + for (int i = 0; i < 3; i++) + { + JumpToFuncs9[i] = Gen_JumpTo9(i); + JumpToFuncs7[i] = Gen_JumpTo7(i); + } + + /* + W5 - mode + W1 - reg num + W3 - in/out value of reg + */ + { + ReadBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W5, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W5, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W5, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W5, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + RET(); + SetJumpTarget(irq); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + RET(); + SetJumpTarget(svc); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + RET(); + SetJumpTarget(abt); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT)); + RET(); + SetJumpTarget(und); + LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + RET(); + } + { + WriteBanked = GetRXPtr(); + + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); + FixupBranch fiq = B(CC_EQ); + SUBS(W1, W1, 13 - 8); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + FixupBranch notEverything = B(CC_LT); + CMP(W5, 0x12); + FixupBranch irq = B(CC_EQ); + CMP(W5, 0x13); + FixupBranch svc = B(CC_EQ); + CMP(W5, 0x17); + FixupBranch abt = B(CC_EQ); + CMP(W5, 0x1B); + FixupBranch und = B(CC_EQ); + SetJumpTarget(notEverything); + MOVI2R(W4, 0); + RET(); + + SetJumpTarget(fiq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(irq); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(svc); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(abt); + STR(INDEX_UNSIGNED, W3, 
X2, offsetof(ARM, R_ABT)); + MOVI2R(W4, 1); + RET(); + SetJumpTarget(und); + STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND)); + MOVI2R(W4, 1); + RET(); + } + + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 33: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 17: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + case 9: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 33: QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 17: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + case 9: QuickCallFunction(X3, SlowRead7<u8>); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; + + SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); +} + +Compiler::~Compiler() +{ +#ifdef __SWITCH__ + if (JitRWStart != NULL) + { + bool succeded = R_SUCCEEDED(svcUnmapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize)); + assert(succeded); + virtmemFree(JitRWStart, JitMemSize); + succeded = R_SUCCEEDED(svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize)); + assert(succeded); + free(JitRWBase); + } +#endif +} + +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::LoadReg(int reg, ARM64Reg nativeReg) +{ + if (reg == 15) + MOVI2R(nativeReg, R15); + else + LDR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::SaveReg(int reg, ARM64Reg nativeReg) +{ + STR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg])); +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + LDR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); +} + +void Compiler::SaveCPSR(bool markClean) +{ + if (CPSRDirty) + { + STR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR)); + CPSRDirty = CPSRDirty && !markClean; + } +} + +FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + LSR(W1, RCPSR, 28); + MOVI2R(W2, 1); + LSLV(W2, W2, W1); + ANDI2R(W2, W2, ARM::ConditionTable[cond], W3); + + return CBZ(W2); + } + else + { + u8 bit = (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))); + + if (cond & 1) + return TBNZ(RCPSR, bit); + else + return TBZ(RCPSR, bit); + } +} + +#define F(x) &Compiler::A_Comp_##x +const Compiler::CompileFunc 
A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // EOR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SUB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSB + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADD + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ADC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // SBC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // RSC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // ORR + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MOV + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // BIC + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), + // MVN + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), + // TST + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // TEQ + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMP + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // CMN + F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), + // Mul + F(Mul), F(Mul), F(Mul_Long), 
F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), + // ARMv5 exclusives + F(Clz), NULL, NULL, NULL, NULL, + + // STR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDR + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // LDRB + F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), + // STRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSB + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // LDRSH + F(MemHD), F(MemHD), F(MemHD), F(MemHD), + // Swap + NULL, NULL, + // LDM, STM + F(LDM_STM), F(LDM_STM), + // Branch + F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), + // Special + NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL, + &Compiler::Nop +}; +#undef F +#define F(x) &Compiler::T_Comp_##x +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = +{ + // Shift imm + F(ShiftImm), F(ShiftImm), F(ShiftImm), + // Add/sub tri operand + F(AddSub_), F(AddSub_), F(AddSub_), F(AddSub_), + // 8 bit imm + F(ALUImm8), F(ALUImm8), F(ALUImm8), F(ALUImm8), + // ALU + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), + // ALU hi reg + F(ALU_HiReg), F(ALU_HiReg), F(ALU_HiReg), + // PC/SP relative ops + F(RelAddr), F(RelAddr), F(AddSP), + // LDR PC rel + F(LoadPCRel), + // LDR/STR reg offset + F(MemReg), F(MemReg), F(MemReg), F(MemReg), + // LDR/STR sign extended, half + F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), + // LDR/STR imm offset + F(MemImm), F(MemImm), F(MemImm), F(MemImm), + // LDR/STR half imm offset + F(MemImmHalf), F(MemImmHalf), + // LDR/STR sp rel + F(MemSPRel), F(MemSPRel), + // PUSH/POP + F(PUSH_POP), F(PUSH_POP), + // LDMIA, STMIA + F(LDMIA_STMIA), F(LDMIA_STMIA), + // Branch + F(BCOND), F(BranchXchangeReg), F(BranchXchangeReg), F(B), F(BL_LONG_1), F(BL_LONG_2), + // Unk, SVC + NULL, NULL, + F(BL_Merged) +}; + +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? 
T_Comp[kind] : A_Comp[kind]) != NULL; +} + +void Compiler::Comp_BranchSpecialBehaviour(bool taken) +{ + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + { + MOVI2R(W0, 1); + STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); + } + + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) + { + RegCache.PrepareExit(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +{ + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) + { + printf("JIT far memory full, resetting...\n"); + ResetBlockCache(); + } + + JitBlockEntry res = (JitBlockEntry)GetRXPtr(); + + Thumb = thumb; + Num = cpu->Num; + CurCPU = cpu; + ConstantCycles = 0; + RegCache = RegisterCache<Compiler, ARM64Reg>(this, instrs, instrsCount, true); + CPSRDirty = false; + + for (int i = 0; i < instrsCount; i++) + { + CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; + + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + Exit = i == (instrsCount - 1) || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + //printf("%x instr %x regs: r%x w%x n%x flags: %x %x %x\n", R15, CurInstr.Instr, CurInstr.Info.SrcRegs, CurInstr.Info.DstRegs, CurInstr.Info.ReadFlags, CurInstr.Info.NotStrictlyNeeded, CurInstr.Info.WriteFlags, CurInstr.SetFlags); + + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOVI2R(W0, R15); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15])); + if (comp == NULL) + { + MOVI2R(W0, CurInstr.Instr); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CurInstr)); + } + if (Num == 0) + { + MOVI2R(W0, (s32)CurInstr.CodeCycles); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles)); + } + } + + if (comp == NULL) + { + SaveCycles(); + SaveCPSR(); + RegCache.Flush(); + } + else + RegCache.Prepare(Thumb, i); + + if (Thumb) + { + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(X0, RCPU); + QuickCallFunction(X1, ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + Comp_AddCycles_C(); + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(X0, RCPU); + QuickCallFunction(X1, InterpretARM[CurInstr.Info.Kind]); + } + else + { + (this->*comp)(); + } + + Comp_BranchSpecialBehaviour(true); + + if (cond < 0xE) + { + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) + { + FixupBranch skipNop = B(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(); + + Comp_BranchSpecialBehaviour(false); + + SetJumpTarget(skipNop); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + { + LoadCycles(); + LoadCPSR(); + } + } + + RegCache.Flush(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); + + FlushIcache(); + + 
return res; +} + +void Compiler::Reset() +{ + LoadStorePatches.clear(); + + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; + + const u32 brk_0 = 0xD4200000; + + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) + *(((u32*)GetRWPtr()) + i) = brk_0; +} + +void Compiler::Comp_AddCycles_C(bool forceNonConstant) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (forceNonConstant) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 numI) +{ + IrregularCycles = true; + + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; + + if (Thumb || CurInstr.Cond() == 0xE) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) +{ + IrregularCycles = true; + + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + c; + + SUB(RCycles, RCycles, cycles); + if (Thumb || CurInstr.Cond() >= 0xE) + ConstantCycles += cycles; + else + SUB(RCycles, RCycles, cycles); +} + +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + SUB(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 24) == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) + SUB(RCycles, RCycles, cycles); + else + ConstantCycles += cycles; +} + +}
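
Comp_AddCycles_CDI and Comp_AddCycles_CD above merge the code-fetch and data-access timings with a rule of the form max(numC + numD - k, max(numC, numD)), with k = 3 on the ARM7 paths and k = 6 on the ARM9 path, so that overlapping bus activity is only partially charged. Here is a standalone sketch of that formula with illustrative names; it is a reading aid, not code from the diff.

    #include <algorithm>
    #include <cstdio>

    // Merge code (numC) and data (numD) cycle counts, crediting at most
    // `overlap` cycles of parallelism between the two accesses.
    static int mergedCycles(int numC, int numD, int overlap)
    {
        return std::max(numC + numD - overlap, std::max(numC, numD));
    }

    int main()
    {
        printf("%d\n", mergedCycles(4, 2, 3)); // 4: the data access hides behind the code fetch
        printf("%d\n", mergedCycles(4, 6, 3)); // 7: only 3 cycles of overlap are credited
        return 0;
    }
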
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h new file mode 100644 index 0000000..e4ffc63 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -0,0 +1,269 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../ARM.h" +#include "../ARMJIT.h" + +#include "../dolphin/Arm64Emitter.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMJIT_RegisterCache.h" + +#include <unordered_map> + +namespace ARMJIT +{ + +const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27; +const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28; +const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29; + +struct Op2 +{ + Op2() + {} + + Op2(Arm64Gen::ARM64Reg rm) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = Arm64Gen::ST_LSL; + Reg.ShiftAmount = 0; + } + + Op2(u32 imm) : IsImm(true), Imm(imm) + {} + + Op2(Arm64Gen::ARM64Reg rm, Arm64Gen::ShiftType st, int amount) : IsImm(false) + { + Reg.Rm = rm; + Reg.ShiftType = st; + Reg.ShiftAmount = amount; + } + + Arm64Gen::ArithOption ToArithOption() + { + assert(!IsImm); + return Arm64Gen::ArithOption(Reg.Rm, Reg.ShiftType, Reg.ShiftAmount); + } + + bool IsSimpleReg() + { return !IsImm && !Reg.ShiftAmount && Reg.ShiftType == Arm64Gen::ST_LSL; } + bool ImmFits12Bit() + { return IsImm && (Imm & 0xFFF == Imm); } + bool IsZero() + { return IsImm && !Imm; } + + bool IsImm; + union + { + struct + { + Arm64Gen::ARM64Reg Rm; + Arm64Gen::ShiftType ShiftType; + int ShiftAmount; + } Reg; + u32 Imm; + }; +}; + +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter +{ +public: + typedef void (Compiler::*CompileFunc)(); + + Compiler(); + ~Compiler(); + + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + + Arm64Gen::ARM64Reg MapReg(int reg) + { + assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); + return RegCache.Mapping[reg]; + } + + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + + bool CanCompile(bool thumb, u16 kind); + + bool FlagsNZNeeded() + { + return CurInstr.SetFlags & 0xC; + } + + void Reset(); + + void Comp_AddCycles_C(bool forceNonConstant = false); + void Comp_AddCycles_CI(u32 numI); + void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); + void Comp_AddCycles_CD(); + void Comp_AddCycles_CDI(); + + void MovePC(); + + void LoadReg(int reg, Arm64Gen::ARM64Reg nativeReg); + void SaveReg(int reg, Arm64Gen::ARM64Reg nativeReg); + + void LoadCPSR(); + void SaveCPSR(bool markClean = true); + + void LoadCycles(); + void SaveCycles(); + + void Nop() {} + + void A_Comp_ALUTriOp(); + void A_Comp_ALUMovOp(); + void A_Comp_ALUCmpOp(); + + void A_Comp_Mul(); + void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); + + void A_Comp_Clz(); + + void A_Comp_MemWB(); + void A_Comp_MemHD(); + + void A_Comp_LDM_STM(); + + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + + void A_Comp_MRS(); + void A_Comp_MSR(); + + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALUImm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); + void T_Comp_AddSP(); + void T_Comp_RelAddr(); + + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + + void T_Comp_LDMIA_STMIA(); + void T_Comp_PUSH_POP(); + + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(); + + 
s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + + void Comp_Mul_Mla(bool S, bool mla, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rm, Arm64Gen::ARM64Reg rs, Arm64Gen::ARM64Reg rn); + + void Comp_Compare(int op, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Logical(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + void Comp_Arithmetic(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2); + + void Comp_RetriveFlags(bool retriveCV); + + Arm64Gen::FixupBranch CheckCondition(u32 cond); + + void Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); + + void A_Comp_GetOp2(bool S, Op2& op2); + + void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); + void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); + + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); + + // 0 = switch mode, 1 = stay arm, 2 = stay thumb + void* Gen_JumpTo9(int kind); + void* Gen_JumpTo7(int kind); + + void Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; + + bool Exit; + + FetchedInstr CurInstr; + bool Thumb; + u32 R15; + u32 Num; + ARM* CurCPU; + u32 ConstantCycles; + u32 CodeRegion; + + BitSet32 SavedRegs; + + u32 JitMemSecondarySize; + u32 JitMemMainSize; + + void* ReadBanked, *WriteBanked; + + void* JumpToFuncs9[3]; + void* JumpToFuncs7[3]; + + std::unordered_map<ptrdiff_t, LoadStorePatch> LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + + RegisterCache<Compiler, Arm64Gen::ARM64Reg> RegCache; + + bool CPSRDirty = false; + + bool IrregularCycles = false; + +#ifdef __SWITCH__ + void* JitRWBase; + void* JitRWStart; + void* JitRXStart; +#endif +}; + +} + +#endif
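
The Op2 struct declared in this header folds the ARM data-processing second operand into either an immediate or a register with a constant shift, which the ALU and load/store emitters then lower case by case. A minimal standalone sketch of the same idea follows; the names are illustrative only, and the 12-bit immediate check (AArch64 ADD/SUB immediates) is written out with explicit masking.

    #include <cassert>
    #include <cstdint>

    enum class Shift { LSL, LSR, ASR, ROR };

    struct Operand2
    {
        bool isImm;
        uint32_t imm;       // valid when isImm
        int reg;            // valid when !isImm
        Shift shiftType;
        int shiftAmount;

        static Operand2 Imm(uint32_t v) { return {true, v, 0, Shift::LSL, 0}; }
        static Operand2 Reg(int r, Shift s = Shift::LSL, int amount = 0)
        {
            return {false, 0, r, s, amount};
        }

        // does the immediate fit an AArch64 ADD/SUB 12-bit immediate field?
        bool fitsAdd12() const { return isImm && (imm & 0xFFFu) == imm; }
    };

    int main()
    {
        assert(Operand2::Imm(0x123).fitsAdd12());
        assert(!Operand2::Imm(0x1234).fitsAdd12());
        assert(!Operand2::Reg(3, Shift::ROR, 8).fitsAdd12());
        return 0;
    }
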
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18
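
ARM_Dispatch and ARM_Ret above form the boundary between C++ and generated code: x0 carries the CPU object, x1 the compiled block entry, and the callee-saved registers that cache RCycles/RCPSR are spilled and restored around the call. The following is a loose, self-contained C++ model of that dispatch pattern, offered purely as an analogy (the real entry points are the assembly stubs above, and the names here are made up).

    #include <cstdio>

    struct CpuState { int cycles; unsigned cpsr; };

    using BlockEntry = void (*)(CpuState&);

    // stands in for ARM_Dispatch: hand the CPU state to a compiled block
    // (the real stub loads RCycles/RCPSR and does "br x1")
    static void dispatch(CpuState& cpu, BlockEntry entry)
    {
        entry(cpu);
    }

    // stands in for a compiled block: consume cycles, then "return" the way
    // a real block would by jumping to ARM_Ret
    static void someCompiledBlock(CpuState& cpu)
    {
        cpu.cycles -= 3;
    }

    int main()
    {
        CpuState cpu{100, 0x1F};
        dispatch(cpu, someCompiledBlock);
        printf("%d\n", cpu.cycles); // 97
        return 0;
    }
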
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..c1b23a7 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -0,0 +1,794 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +#include "../ARMJIT_Memory.h" + +using namespace Arm64Gen; + +namespace ARMJIT +{ + +bool Compiler::IsJITFault(u64 pc) +{ + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); +} + +s64 Compiler::RewriteMemAccess(u64 pc) +{ + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); + + auto it = LoadStorePatches.find(pcOffset); + + if (it != LoadStorePatches.end()) + { + LoadStorePatch patch = it->second; + + ptrdiff_t curCodeOffset = GetCodeOffset(); + + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); + + BL(patch.PatchFunc); + + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); + + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); + + SetCodePtrUnsafe(curCodeOffset); + + LoadStorePatches.erase(it); + + return patch.PatchOffset; + } + printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); +} + +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 localAddr = LocaliseCodeAddress(Num, addr); + + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) + { + InvalidLiterals.Remove(invalidLiteralIdx); + return false; + } + + Comp_AddCycles_CDI(); + + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOVI2R(MapReg(rd), val); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + return true; +} + +void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } + + if (flags & memop_Store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); + + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; + } + + ARM64Reg finalAddr = W0; + if (flags & memop_Post) + { + finalAddr = rnMapped; + MOV(W0, rnMapped); + } + + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); + + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might has become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) + { + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } + + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } + + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); + + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); + + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); + if (flags & memop_Store) + { + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); + } + else + { + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? 
X1 : X0, X7); + if (size == 32) + { + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); + } + } + + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); + + if (func) + { + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } + } + else + { + if (Num == 0) + { + MOV(X1, RCPU); + if (flags & memop_Store) + { + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + } + } + else + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + } + } + } + else + { + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + } + else + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead7<u8>); break; + } + } + } + + if (!(flags & memop_Store)) + { + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } + } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } +} + +void Compiler::A_Comp_MemWB() +{ + Op2 offset; + if (CurInstr.Instr & (1 << 25)) + offset = Op2(MapReg(CurInstr.A_Reg(0)), (ShiftType)((CurInstr.Instr >> 5) & 0x3), (CurInstr.Instr >> 7) & 0x1F); + else + offset = Op2(CurInstr.Instr & 0xFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, byte ? 8 : 32, flags); +} + +void Compiler::A_Comp_MemHD() +{ + bool load = CurInstr.Instr & (1 << 20); + bool signExtend; + int op = (CurInstr.Instr >> 5) & 0x3; + int size; + + if (load) + { + signExtend = op >= 2; + size = op == 2 ? 
8 : 16; + } + else + { + size = 16; + signExtend = false; + } + + Op2 offset; + if (CurInstr.Instr & (1 << 22)) + offset = Op2((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0)); + else + offset = Op2(MapReg(CurInstr.A_Reg(0))); + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), + Op2(MapReg(CurInstr.T_Reg(6))), byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(MapReg(CurInstr.T_Reg(6))), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; + + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 0 : memop_Store); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + IrregularCycles = true; + + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) + { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) + Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); + + int expectedTarget = Num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; + + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && reg >= 8 && reg < 15) + { + if (RegCache.LoadedRegs & (1 << reg)) + MOV(W3, MapReg(reg)); + else + LoadReg(reg, W3); + MOVI2R(W1, reg - 8); + BL(ReadBanked); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else + LoadReg(reg, W3); + + if (RegCache.LoadedRegs & (1 << *nextReg)) + second = MapReg(*nextReg); + else + LoadReg(*nextReg, W4); + + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); + + i++; + it++; + } + else if (RegCache.LoadedRegs & (1 << reg)) + { + STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } + else + { + LoadReg(reg, W3); + STR(INDEX_UNSIGNED, W3, SP, i * 8); + } + i++; + it++; + } + } + + ADD(X1, SP, 0); + MOVI2R(W2, regsCount); + + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break; + case 1: 
QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break; + } + } + + if (!store) + { + if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); + + BitSet16::Iterator it = regs.begin(); + while (it != regs.end()) + { + BitSet16::Iterator nextReg = it; + nextReg++; + + int reg = *it; + + if (usermode && !regs[15] && reg >= 8 && reg < 15) + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + MOVI2R(W1, reg - 8); + BL(WriteBanked); + FixupBranch alreadyWritten = CBNZ(W4); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(MapReg(reg), W3); + else + SaveReg(reg, W3); + SetJumpTarget(alreadyWritten); + } + else if (!usermode && nextReg != regs.end()) + { + ARM64Reg first = W3, second = W4; + + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + if (RegCache.LoadedRegs & (1 << *nextReg)) + second = MapReg(*nextReg); + + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); + + if (first == W3) + SaveReg(reg, W3); + if (second == W4) + SaveReg(*nextReg, W4); + + it++; + i++; + } + else if (RegCache.LoadedRegs & (1 << reg)) + { + ARM64Reg mapped = MapReg(reg); + LDR(INDEX_UNSIGNED, mapped, SP, i * 8); + } + else + { + LDR(INDEX_UNSIGNED, W3, SP, i * 8); + SaveReg(reg, W3); + } + + it++; + i++; + } + } + ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + + if (!store && regs[15]) + { + ARM64Reg mapped = MapReg(15); + Comp_JumpTo(mapped, Num == 0, usermode); + } + + return regsCount * 4 * (decrement ? -1 : 1); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? 
(!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + { + if (offset > 0) + ADD(rn, rn, offset); + else + SUB(rn, rn, -offset); + } +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + ARM64Reg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + if (offset > 0) + ADD(sp, sp, offset); + else + SUB(sp, sp, -offset); +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + ARM64Reg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + u32 regsCount = regs.Count(); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + { + if (offset > 0) + ADD(rb, rb, offset); + else + SUB(rb, rb, -offset); + } +} + +}
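Note on the LDM/STM writeback logic in A_Comp_LDM_STM above: on the ARM9 side (Num == 0), base writeback with the base register in the list is only kept when the base is either the sole register in the list or not the last (highest-numbered) one; the ARM7 path drops writeback entirely in that case. A minimal standalone sketch of that predicate using plain integer masks instead of BitSet16 (the helper name and test values are illustrative, not part of the source):

#include <cstdint>
#include <cstdio>

// Mirrors the BitSet16 expression used for the ARM9 path above: writeback is kept
// if rn is the only listed register, or if some register above rn is also listed.
static bool LDMWritesBackBase(uint16_t regs, int rn)
{
    bool rnIsOnlyReg = (regs & ~(1u << rn)) == 0;        // no other bits set
    bool regsAboveRn = (regs & ~((2u << rn) - 1)) != 0;  // any register above rn
    return rnIsOnlyReg || regsAboveRn;
}

int main()
{
    printf("%d\n", LDMWritesBackBase(0x0002, 1)); // ldmia r1!, {r1}       -> 1
    printf("%d\n", LDMWritesBackBase(0x000E, 1)); // ldmia r1!, {r1,r2,r3} -> 1
    printf("%d\n", LDMWritesBackBase(0x0003, 1)); // ldmia r1!, {r0,r1}    -> 0
}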
\ No newline at end of file diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +}
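ARMJIT_Compiler.h is a thin compile-time dispatch header: code that needs the compiler includes it and talks to the single global JITCompiler instance without caring whether the x64 or ARM64 backend was selected. A hedged usage sketch (the wrapper function is made up for illustration; IsJITFault/RewriteMemAccess are assumed from their use in ARMJIT_Memory.cpp further down):

#include "ARMJIT_Compiler.h"
#include "types.h"

// Illustrative helper, not part of the source: resolve a host fault PC against
// whichever backend compiler is active.
static s32 ResolveJitFault(u64 faultPC)
{
    if (!ARMJIT::JITCompiler->IsJITFault(faultPC))
        return 0;
    // Patch the faulting access over to the slow path and return the PC adjustment,
    // mirroring what ARMJIT_Memory::FaultHandler does below.
    return ARMJIT::JITCompiler->RewriteMemAccess(faultPC);
}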
\ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h new file mode 100644 index 0000000..c87e1b3 --- /dev/null +++ b/src/ARMJIT_Internal.h @@ -0,0 +1,227 @@ +#ifndef ARMJIT_INTERNAL_H +#define ARMJIT_INTERNAL_H + +#include "types.h" +#include <stdint.h> +#include <string.h> +#include <assert.h> + +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 Addr; + + u8 DataCycles; + u16 CodeCycles; + u32 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template <typename T> +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u16 Length = 0; + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(T) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void SetLength(u16 length) + { + if (Capacity < length) + MakeCapacity(length); + + Length = length; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) + { + Num = num; + NumAddresses = numAddresses; + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); + } + + u32 StartAddr; + u32 StartAddrLocal; + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; + + JitBlockEntry EntryPoint; + + u32* AddressRanges() + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } + +private: + TinyVector<u32> Data; +}; + +// size should be 16 bytes because I'm to lazy to use mul and whatnot +struct __attribute__((packed)) AddressRange +{ + TinyVector<JitBlock*> Blocks; + u32 Code; +}; + + +typedef void (*InterpreterFunc)(ARM* cpu); +extern 
InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +extern TinyVector<u32> InvalidLiterals; + +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; + +inline bool PageContainsCode(AddressRange* range) +{ + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); + +template <u32 Num> +void LinkBlock(ARM* cpu, u32 codeOffset); + +template <typename T, int ConsoleType> T SlowRead9(u32 addr, ARMv5* cpu); +template <typename T, int ConsoleType> void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template <typename T, int ConsoleType> T SlowRead7(u32 addr); +template <typename T, int ConsoleType> void SlowWrite7(u32 addr, T val); + +template <bool Write, int ConsoleType> void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template <bool Write, int ConsoleType> void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + +} + +#endif
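A small usage sketch of the TinyVector container declared above (illustrative only, assuming the usual u32 typedef from types.h): it is meant for short POD-only lists, and Remove()/RemoveByValue() swap the last element into the removed slot, so removal is O(1) but element order is not preserved.

#include <cstdio>
#include "ARMJIT_Internal.h"

void TinyVectorExample()
{
    ARMJIT::TinyVector<u32> v;
    v.Add(10);
    v.Add(20);
    v.Add(30);

    v.RemoveByValue(10);        // 30 is moved into slot 0; contents become {30, 20}

    for (int i = 0; i < v.Length; i++)
        printf("%u\n", v[i]);   // prints 30 then 20

    v.Clear();                  // resets Length, keeps the allocation
}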
\ No newline at end of file diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..0276c65 --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,1072 @@ +#if defined(__SWITCH__) +#include "switch/compat_switch.h" +#elif defined(_WIN32) +#include <windows.h> +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "DSi.h" +#include "GPU.h" +#include "GPU3D.h" +#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include <malloc.h> + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and map the memory regions as they're structured on the DS + in it. + + On most systems you have a single piece of main ram, + maybe some video ram and faster cache RAM and that's about it. + Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is 16kb SWRAM which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform specific code. + +*/ + +namespace ARMJIT_Memory +{ +struct FaultDescription +{ + u32 EmulatedFaultAddr; + u64 FaultPC; +}; + +bool FaultHandler(FaultDescription* faultDesc, s32& offset); +} + +#if defined(__SWITCH__) +// with LTO the symbols seem to be not properly overriden +// if they're somewhere else + +extern "C" +{ + +void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = ctx->cpu_gprs[0].w; + desc.FaultPC = ctx->pc.x; + + u64 integerRegisters[33]; + memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); + integerRegisters[29] = ctx->fp.x; + integerRegisters[30] = ctx->lr.x; + integerRegisters[31] = ctx->sp.x; + integerRegisters[32] = ctx->pc.x; + + s32 offset; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + integerRegisters[32] += offset; + + ARM_RestoreContext(integerRegisters); + } + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} + +#elif defined(_WIN32) + +static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) +{ + if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) + return EXCEPTION_CONTINUE_SEARCH; + + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = exceptionInfo->ContextRecord->Rcx; + desc.FaultPC = exceptionInfo->ContextRecord->Rip; + + s32 offset = 0; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + exceptionInfo->ContextRecord->Rip += offset; + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + +#endif + +namespace ARMJIT_Memory +{ + +void* FastMem9Start, *FastMem7Start; + +#ifdef _WIN32 +inline u32 RoundUp(u32 size) +{ + return (size + 0xFFFF) & ~0xFFFF; +} +#else +inline u32 RoundUp(u32 size) +{ + return size; +} +#endif + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = RoundUp(NDS::MainRAMMaxSize); +const 
u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(NDS::SharedWRAMSize); +const u32 MemBlockDTCMOffset = MemBlockARM7WRAMOffset + RoundUp(NDS::ARM7WRAMSize); +const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(DSi::NWRAMSize); +const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(DSi::NWRAMSize); +const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(DSi::NWRAMSize); + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockNWRAM_AOffset, + MemBlockNWRAM_BOffset, + MemBlockNWRAM_COffset +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#if defined(__SWITCH__) +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#elif defined(_WIN32) +u8* MemoryBase; +HANDLE MemoryFile; +LPVOID ExceptionHandlerHandle; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#elif defined(_WIN32) + bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; + return r; +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + return R_SUCCEEDED(r); +#else + return UnmapViewOfFile(dst); +#endif +} + +void SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#if defined(_WIN32) + DWORD winProtection, oldProtection; + if (protection == 0) + winProtection = PAGE_NOACCESS; + else if (protection == 1) + winProtection = PAGE_READONLY; + else + winProtection = PAGE_READWRITE; + VirtualProtect(dst, size, winProtection, &oldProtection); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? 
MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + +#ifdef __SWITCH__ + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } +#endif + } + } +#if defined(_WIN32) + UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); +#endif + } +}; +ARMJIT::TinyVector<Mapping> Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (offset < mapping.LocalOffset || offset >= mapping.LocalOffset + mapping.Size) + continue; + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; + +#if defined(__SWITCH__) + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); +#elif defined(_WIN32) + SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 
1 : 2); +#endif + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then map the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCBEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCBEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd < start); + bool newOverlap = newSize > 0 && !(newBase >= end || newEnd < start); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapNWRAM(int num) +{ + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) + { + Mapping& mapping = Mappings[memregion_SharedWRAM][i]; + if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size + || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + { + mapping.Unmap(memregion_SharedWRAM); + Mappings[memregion_SharedWRAM].Remove(i); + } + else + { + i++; + } + } + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + num].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + num][i].Unmap(memregion_NewSharedWRAM_A + num); + } + Mappings[memregion_NewSharedWRAM_A + num].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length; i++) + { + Mappings[memregion_SharedWRAM][i].Unmap(memregion_SharedWRAM); + } + Mappings[memregion_SharedWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); + for (int j = 0; j < 3; j++) + { + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + j].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + j][i].Unmap(memregion_NewSharedWRAM_A + j); + } + Mappings[memregion_NewSharedWRAM_A + j].Clear(); + } +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsFastmemCompatible(region)) + return false; + + return false; + + u32 mirrorStart, mirrorSize, memoryOffset; + bool isMapped = GetMirrorLocation(region, num, addr, memoryOffset, mirrorStart, mirrorSize); + if (!isMapped) + return false; + + u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %x, %x %d %d\n", mirrorStart, mirrorSize, region, num); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + +#if defined(_WIN32) + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); +#endif + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < mirrorSize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < mirrorSize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + +#if defined(__SWITCH__) + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeded); + } +#elif defined(_WIN32) + if (hasCode) + { + SetCodeProtectionRange(mirrorStart + offset, sectionSize, num, 1); + } +#endif + } + } + + assert(num == 0 || num == 1); + Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); + + return true; +} + +bool FaultHandler(FaultDescription* faultDesc, s32& offset) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->EmulatedFaultAddr; + + if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr); + + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC); + } + return true; + } + return false; +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeded); + + // 8 GB of address space, just don't ask... 
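    // Each emulated CPU gets its own full 4 GiB (0x100000000) reservation below,
    // i.e. one complete 32-bit guest address space per core, so the fast path can
    // form a host pointer as FastMemXStart + guestAddr with no range check.
    // Touching a page that is not mapped raises a fault, which the handler above
    // resolves (typically by rewriting the access to the slow-path helpers).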
+ FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + u8* basePtr = MemoryBaseCodeMem; +#elif defined(_WIN32) + ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler); + + MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL); + + MemoryBase = (u8*)VirtualAlloc(NULL, MemoryTotalSize, MEM_RESERVE, PAGE_READWRITE); + + FastMem9Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + FastMem7Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + + // only free them after they have all been reserved + // so they can't overlap + VirtualFree(MemoryBase, 0, MEM_RELEASE); + VirtualFree(FastMem9Start, 0, MEM_RELEASE); + VirtualFree(FastMem7Start, 0, MEM_RELEASE); + + MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase); + + u8* basePtr = MemoryBase; +#endif + NDS::MainRAM = basePtr + MemBlockMainRAMOffset; + NDS::SharedWRAM = basePtr + MemBlockSWRAMOffset; + NDS::ARM7WRAM = basePtr + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = basePtr + MemBlockDTCMOffset; + DSi::NWRAM_A = basePtr + MemBlockNWRAM_AOffset; + DSi::NWRAM_B = basePtr + MemBlockNWRAM_BOffset; + DSi::NWRAM_C = basePtr + MemBlockNWRAM_COffset; +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#elif defined(_WIN32) + assert(UnmapViewOfFile(MemoryBase)); + CloseHandle(MemoryFile); + + RemoveVectoredExceptionHandler(ExceptionHandlerHandle); +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsFastmemCompatible(int region) +{ +#ifdef _WIN32 + /* + TODO: with some hacks, the smaller shared WRAM regions + could be mapped in some occaisons as well + */ + if (region == memregion_DTCM + || region == memregion_SharedWRAM + || region == memregion_NewSharedWRAM_B + || region == memregion_NewSharedWRAM_C) + return false; +#endif + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mirrorStart = addr & ~(ITCMPhysicalSize - 1); + mirrorSize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_MainRAM: + mirrorStart = addr & ~NDS::MainRAMMask; + mirrorSize = NDS::MainRAMMask + 1; + return true; + case memregion_BIOS9: + if (num == 0) + { + mirrorStart = addr & ~0xFFF; + mirrorSize = 0x1000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mirrorStart = 0; + mirrorSize = 0x4000; + return true; + } + return false; + case memregion_SharedWRAM: + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mirrorStart = addr & ~NDS::SWRAM_ARM9.Mask; + mirrorSize = NDS::SWRAM_ARM9.Mask + 1; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + return true; + } + 
else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mirrorStart = addr & ~NDS::SWRAM_ARM7.Mask; + mirrorSize = NDS::SWRAM_ARM7.Mask + 1; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + mirrorStart = addr & ~(NDS::ARM7WRAMSize - 1); + mirrorSize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + mirrorStart = addr & ~0xFFFFF; + mirrorSize = 0x100000; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mirrorStart = addr & ~0x3FFFF; + mirrorSize = 0x40000; + return true; + } + return false; + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_A; + mirrorStart = addr & ~0xFFFF; + mirrorSize = 0x10000; + return true; + } + return false; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_B; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; + } + return false; // zero filled memory + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_C; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; + } + return false; // zero filled memory + } + case memregion_BIOS9DSi: + if (num == 0) + { + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<0) ? 0x8000 : 0x10000; + return true; + } + return false; + case memregion_BIOS7DSi: + if (num == 1) + { + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<8) ? 0x8000 : 0x10000; + return true; + } + return false; + default: + assert(false && "For the time being this should only be used for code"); + return false; + } +} + +u32 LocaliseAddress(int region, u32 num, u32 addr) +{ + switch (region) + { + case memregion_ITCM: + return (addr & (ITCMPhysicalSize - 1)) | (memregion_ITCM << 27); + case memregion_MainRAM: + return (addr & NDS::MainRAMMask) | (memregion_MainRAM << 27); + case memregion_BIOS9: + return (addr & 0xFFF) | (memregion_BIOS9 << 27); + case memregion_BIOS7: + return (addr & 0x3FFF) | (memregion_BIOS7 << 27); + case memregion_SharedWRAM: + if (num == 0) + return ((addr & NDS::SWRAM_ARM9.Mask) + (NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + else + return ((addr & NDS::SWRAM_ARM7.Mask) + (NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + case memregion_WRAM7: + return (addr & (NDS::ARM7WRAMSize - 1)) | (memregion_WRAM7 << 27); + case memregion_VRAM: + // TODO: take mapping properly into account + return (addr & 0xFFFFF) | (memregion_VRAM << 27); + case memregion_VWRAM: + // same here + return (addr & 0x3FFFF) | (memregion_VWRAM << 27); + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + return (ptr - DSi::NWRAM_A + (addr & 0xFFFF)) | (memregion_NewSharedWRAM_A << 27); + else + return memregion_Other << 27; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + return (ptr - DSi::NWRAM_B + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_B << 27); + else + return memregion_Other << 27; + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & 
DSi::NWRAMMask[num][2]]; + if (ptr) + return (ptr - DSi::NWRAM_C + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_C << 27); + else + return memregion_Other << 27; + } + case memregion_BIOS9DSi: + case memregion_BIOS7DSi: + return (addr & 0xFFFF) | (region << 27); + default: + assert(false && "This should only be needed for regions which can contain code"); + return memregion_Other << 27; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + { + return memregion_ITCM; + } + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + { + return memregion_DTCM; + } + else + { + if (NDS::ConsoleType == 1 && addr >= 0xFFFF0000 && !(DSi::SCFG_BIOS & (1<<1))) + { + if ((addr >= 0xFFFF8000) && (DSi::SCFG_BIOS & (1<<0))) + return memregion_Other; + + return memregion_BIOS9DSi; + } + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + { + return memregion_BIOS9; + } + + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[0][0] && addr < DSi::NWRAMEnd[0][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[0][1] && addr < DSi::NWRAMEnd[0][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[0][2] && addr < DSi::NWRAMEnd[0][2]) + return memregion_NewSharedWRAM_C; + } + + if (NDS::SWRAM_ARM9.Mem) + return memregion_SharedWRAM; + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + default: + return memregion_Other; + } + } +} + +int ClassifyAddress7(u32 addr) +{ + if (NDS::ConsoleType == 1 && addr < 0x00010000 && !(DSi::SCFG_BIOS & (1<<9))) + { + if (addr >= 0x00008000 && DSi::SCFG_BIOS & (1<<8)) + return memregion_Other; + + return memregion_BIOS7DSi; + } + else if (addr < 0x00004000) + { + return memregion_BIOS7; + } + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[1][0] && addr < DSi::NWRAMEnd[1][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[1][1] && addr < DSi::NWRAMEnd[1][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[1][2] && addr < DSi::NWRAMEnd[1][2]) + return memregion_NewSharedWRAM_C; + } + + if (NDS::SWRAM_ARM7.Mem) + return memregion_SharedWRAM; + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG<T>(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG<T>(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ<T>(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ<T>(addr, val); return; + default: GPU::WriteVRAM_LCDC<T>(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG<T>(addr); + case 0x00200000: return GPU::ReadVRAM_BBG<T>(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ<T>(addr); + case 0x00600000: 
return GPU::ReadVRAM_BOBJ<T>(addr); + default: return GPU::ReadVRAM_LCDC<T>(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM9IORead8; + case 9: return (void*)DSi::ARM9IOWrite8; + case 16: return (void*)DSi::ARM9IORead16; + case 17: return (void*)DSi::ARM9IOWrite16; + case 32: return (void*)DSi::ARM9IORead32; + case 33: return (void*)DSi::ARM9IOWrite32; + } + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM7IORead8; + case 9: return (void*)DSi::ARM7IOWrite8; + case 16: return (void*)DSi::ARM7IORead16; + case 17: return (void*)DSi::ARM7IOWrite16; + case 32: return (void*)DSi::ARM7IORead32; + case 33: return (void*)DSi::ARM7IOWrite32; + } + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; + case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; + case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; + case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; + case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; + case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; + } + } + } + return NULL; +} + +}
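Minimal illustration of the `size | store` trick used by GetFuncForAddr above: size is always one of 8/16/32 (all even) and the store flag only contributes bit 0, so read handlers land on the even case labels and write handlers on the odd ones. The helper below is illustrative, not part of the source:

#include <cstdio>

static const char* DescribeAccess(int size, bool store)
{
    switch (size | store)
    {
    case 8:  return "read8";
    case 9:  return "write8";
    case 16: return "read16";
    case 17: return "write16";
    case 32: return "read32";
    case 33: return "write32";
    default: return "invalid";
    }
}

int main()
{
    printf("%s\n", DescribeAccess(16, true));  // write16
    printf("%s\n", DescribeAccess(32, false)); // read32
}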
\ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..123e18e --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,63 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SharedWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + + // DSi + memregion_BIOS9DSi, + memregion_BIOS7DSi, + memregion_NewSharedWRAM_A, + memregion_NewSharedWRAM_B, + memregion_NewSharedWRAM_C, + + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize); +u32 LocaliseAddress(int region, u32 num, u32 addr); + +bool IsFastmemCompatible(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); +void RemapNWRAM(int num); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif
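Sketch of how the value returned by ARMJIT_Memory::LocaliseAddress() is laid out, based on the implementation above: the memregion_* id sits in the top bits (region << 27) and the mirror-independent offset in the low 27 bits. The two helpers below are illustrative, not part of the source:

#include "ARMJIT_Memory.h"

inline int LocalisedRegion(u32 localAddr)
{
    return localAddr >> 27;
}

inline u32 LocalisedOffset(u32 localAddr)
{
    return localAddr & ((1u << 27) - 1);
}

// e.g. classifying and localising an ARM9 main-RAM address:
//   int region = ARMJIT_Memory::ClassifyAddress9(0x02001234);
//   u32 local  = ARMJIT_Memory::LocaliseAddress(region, 0, 0x02001234);
//   LocalisedRegion(local) == ARMJIT_Memory::memregion_MainRAM
//   LocalisedOffset(local) == 0x1234 (assuming the default 4 MB main-RAM mirror mask)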
\ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h new file mode 100644 index 0000000..0547c84 --- /dev/null +++ b/src/ARMJIT_RegisterCache.h @@ -0,0 +1,199 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include <assert.h> + +namespace ARMJIT +{ + +template <typename T, typename Reg> +class RegisterCache +{ +public: + RegisterCache() + {} + + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount, bool pcAllocatableAsSrc = false) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + + PCAllocatableAsSrc = ~(pcAllocatableAsSrc + ? 0 + : (1 << 15)); + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->SaveReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg, bool loadValue) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + if (loadValue) + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + LiteralsLoaded = 0; + } + + void Prepare(bool thumb, int i) + { + FetchedInstr instr = Instrs[i]; + + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + // we don't need to load a value which is always going to be overwritten + BitSet16 
needValueLoaded(needToBeLoaded); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); + for (int reg : needToBeLoaded) + LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } + } + + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + u16 PCAllocatableAsSrc = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..43b94b6 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -0,0 +1,768 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +// uses RSCRATCH3 +void Compiler::Comp_ArithTriOp(void (Compiler::*op)(int, const OpArg&, const OpArg&), + OpArg rd, OpArg rn, OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (rd == rn && !(opFlags & opInvertOp2)) + (this->*op)(32, rd, op2); + else if (opFlags & opSymmetric && op2 == R(RSCRATCH)) + { + if (opFlags & opInvertOp2) + NOT(32, op2); + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + } + else + { + if (opFlags & opInvertOp2) + { + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + NOT(32, op2); + } + MOV(32, R(RSCRATCH3), rn); + (this->*op)(32, R(RSCRATCH3), op2); + MOV(32, rd, R(RSCRATCH3)); + } + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) +{ + switch (op) + { + case 0: // TST + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; + } + + Comp_RetriveFlags(op == 2, op >= 2, carryUsed); +} + +// also calculates cycles +OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) +{ + if (CurInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + carryUsed = false; + return Imm32(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); + } + else + { + S = S && (CurInstr.SetFlags & 
0x2); + + int op = (CurInstr.Instr >> 5) & 0x3; + if (CurInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + if (rm.IsImm() && CurInstr.A_Reg(0) == 15) + rm = Imm32(rm.Imm32() + 4); + return Comp_RegShiftReg(op, MapReg(CurInstr.A_Reg(8)), rm, S, carryUsed); + } + else + { + Comp_AddCycles_C(); + return Comp_RegShiftImm(op, (CurInstr.Instr >> 7) & 0x1F, + MapReg(CurInstr.A_Reg(0)), S, carryUsed); + } + } +} + +void Compiler::A_Comp_CmpOp() +{ + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); + + Comp_CmpOp(op - 0x8, rn, op2, carryUsed); +} + +void Compiler::A_Comp_Arith() +{ + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); + + u32 sFlag = S ? opSetsFlags : 0; + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + break; + case 0x1: // EOR + Comp_ArithTriOp(&Compiler::XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + break; + case 0x2: // SUB + Comp_ArithTriOp(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + break; + case 0x3: // RSB + if (op2.IsZero()) + { + if (rd != rn) + MOV(32, rd, rn); + NEG(32, rd); + if (S) + Comp_RetriveFlags(true, true, false); + } + else + Comp_ArithTriOpReverse(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + break; + case 0x4: // ADD + Comp_ArithTriOp(&Compiler::ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + break; + case 0x5: // ADC + Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + break; + case 0x6: // SBC + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + break; + case 0x7: // RSC + Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + break; + case 0xC: // ORR + Comp_ArithTriOp(&Compiler::OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + break; + case 0xE: // BIC + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + break; + default: + assert("unimplemented"); + } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); +} + +void Compiler::A_Comp_MovOp() +{ + bool carryUsed; + bool S = CurInstr.Instr & (1 << 20); + OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (rd != op2) + MOV(32, rd, op2); + + if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { + NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); + + if (S) + { + if (FlagsNZRequired()) + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); + } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); +} + +void Compiler::A_Comp_CLZ() +{ + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + + MOV(32, R(RSCRATCH), Imm32(32)); + TEST(32, rm, rm); + FixupBranch skipZero = J_CC(CC_Z); + BSR(32, RSCRATCH, rm); + XOR(32, R(RSCRATCH), Imm8(0x1F)); // 31 - RSCRATCH + SetJumpTarget(skipZero); + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::Comp_MulOp(bool 
S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn) +{ + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, add ? 2 : 1); + } + + static_assert(EAX == RSCRATCH, "Someone changed RSCRATCH!"); + MOV(32, R(RSCRATCH), rm); + if (add) + { + IMUL(32, RSCRATCH, rs); + LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); + if (S && FlagsNZRequired()) + TEST(32, rd, rd); + } + else + { + IMUL(32, RSCRATCH, rs); + MOV(32, rd, R(RSCRATCH)); + if (S && FlagsNZRequired()) + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + + if (S) + Comp_RetriveFlags(false, false, false); +} + +void Compiler::A_Comp_MUL_MLA() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn; + if (add) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_MulOp(S, add, rd, rm, rs, rn); +} + +void Compiler::A_Comp_Mul_Long() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn = MapReg(CurInstr.A_Reg(12)); + + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + if (sign) + { + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + } + else + { + BSR(32, RSCRATCH, R(RSCRATCH3)); + } + + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, 2); + } + + if (sign) + { + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + } + else + { + MOV(32, R(RSCRATCH2), rm); + MOV(32, R(RSCRATCH3), rs); + } + if (add) + { + MOV(32, R(RSCRATCH), rd); + SHL(64, R(RSCRATCH), Imm8(32)); + OR(64, R(RSCRATCH), rn); + + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + ADD(64, R(RSCRATCH2), R(RSCRATCH)); + } + else + { + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + if (S && FlagsNZRequired()) + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + } + + if (S) + Comp_RetriveFlags(false, false, false); + + MOV(32, rn, R(RSCRATCH2)); + SHR(64, R(RSCRATCH2), Imm8(32)); + MOV(32, rd, R(RSCRATCH2)); +} + +void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) +{ + if (CurInstr.SetFlags == 0) + return; + if (retriveCV && !(CurInstr.SetFlags & 0x3)) + retriveCV = false; + + bool carryOnly = !retriveCV && carryUsed; + if (carryOnly && !(CurInstr.SetFlags & 0x2)) + { + carryUsed = false; + carryOnly = false; + } + + CPSRDirty = true; + + if (retriveCV) + { + SETcc(CC_O, R(RSCRATCH)); + SETcc(sign ? 
CC_NC : CC_C, R(RSCRATCH3)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); + } + + if (FlagsNZRequired()) + { + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else if (carryUsed || retriveCV) + { + SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); + AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH2)); + } +} + +// always uses RSCRATCH, RSCRATCH2 only if S == true +OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = S; + + if (S) + { + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); + } + + MOV(32, R(RSCRATCH), rm); + static_assert(RSCRATCH3 == ECX, "Someone changed RSCRATCH3"); + MOV(32, R(ECX), rs); + AND(32, R(ECX), Imm32(0xFF)); + + FixupBranch zero = J_CC(CC_Z); + if (op < 3) + { + void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; + if (op == 0) + shiftOp = &Compiler::SHL; + else if (op == 1) + shiftOp = &Compiler::SHR; + else if (op == 2) + shiftOp = &Compiler::SAR; + + CMP(32, R(ECX), Imm8(32)); + FixupBranch lt32 = J_CC(CC_L); + FixupBranch done1; + if (op < 2) + { + FixupBranch eq32 = J_CC(CC_E); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + if (S) + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + done1 = J(); + SetJumpTarget(eq32); + } + (this->*shiftOp)(32, R(RSCRATCH), Imm8(31)); + (this->*shiftOp)(32, R(RSCRATCH), Imm8(1)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + FixupBranch done2 = J(); + + SetJumpTarget(lt32); + (this->*shiftOp)(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + if (op < 2) + SetJumpTarget(done1); + SetJumpTarget(done2); + + } + else if (op == 3) + { + if (S) + BT(32, R(RSCRATCH), Imm8(31)); + ROR_(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + } + SetJumpTarget(zero); + + return R(RSCRATCH); +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? 
amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + + assert(false); +} + +void Compiler::T_Comp_ShiftImm() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; + + Comp_AddCycles_C(); + + bool carryUsed; + OpArg shifted = Comp_RegShiftImm(op, amount, rs, true, carryUsed); + + if (shifted != rd) + MOV(32, rd, shifted); + + if (FlagsNZRequired()) + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); +} + +void Compiler::T_Comp_AddSub_() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 9) & 0x3; + + OpArg rn = op >= 2 ? Imm32((CurInstr.Instr >> 6) & 0x7) : MapReg(CurInstr.T_Reg(6)); + + Comp_AddCycles_C(); + + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) + Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + else + Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); +} + +void Compiler::T_Comp_ALU_Imm8() +{ + OpArg rd = MapReg(CurInstr.T_Reg(8)); + + u32 op = (CurInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurInstr.Instr & 0xFF); + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + MOV(32, rd, imm); + if (FlagsNZRequired()) + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(&Compiler::ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(&Compiler::SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; + } +} + +void Compiler::T_Comp_MUL() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + Comp_MulOp(true, false, rd, rd, rs, Imm8(-1)); +} + +void Compiler::T_Comp_ALU() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + + u32 op = (CurInstr.Instr >> 6) & 0xF; + + if ((op >= 0x2 && op < 0x4) || op == 0x7) + Comp_AddCycles_CI(1); // shift by reg + else + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x1: // EOR + Comp_ArithTriOp(&Compiler::XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + int shiftOp = op == 0x7 ? 
3 : op - 0x2; + bool carryUsed; + OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); + if (FlagsNZRequired()) + TEST(32, shifted, shifted); + MOV(32, rd, shifted); + Comp_RetriveFlags(false, false, true); + } + return; + case 0x5: // ADC + Comp_ArithTriOp(&Compiler::ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + return; + case 0x6: // SBC + Comp_ArithTriOp(&Compiler::SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + return; + case 0x8: // TST + Comp_CmpOp(0, rd, rs, false); + return; + case 0x9: // NEG + if (rd != rs) + MOV(32, rd, rs); + NEG(32, rd); + Comp_RetriveFlags(true, true, false); + return; + case 0xA: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0xB: // CMN + Comp_CmpOp(3, rd, rs, false); + return; + case 0xC: // ORR + Comp_ArithTriOp(&Compiler::OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0xE: // BIC + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + return; + case 0xF: // MVN + if (rd != rs) + MOV(32, rd, rs); + NOT(32, rd); + Comp_RetriveFlags(false, false, false); + return; + default: + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + OpArg rdMapped = MapReg(rd); + OpArg rs = MapReg((CurInstr.Instr >> 3) & 0xF); + + u32 op = (CurInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // ADD + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric); + break; + case 0x1: // CMP + Comp_CmpOp(2, rdMapped, rs, false); + return; // this is on purpose + case 0x2: // MOV + if (rdMapped != rs) + MOV(32, rdMapped, rs); + break; + } + + if (rd == 15) + { + OR(32, rdMapped, Imm8(1)); + Comp_JumpTo(rdMapped.GetSimpleReg()); + } +} + +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + OpArg sp = MapReg(13); + OpArg offset = Imm32((CurInstr.Instr & 0x7F) << 2); + if (CurInstr.Instr & (1 << 7)) + SUB(32, sp, offset); + else + ADD(32, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + OpArg sp = MapReg(13); + LEA(32, rd.GetSimpleReg(), MDisp(sp.GetSimpleReg(), offset)); + } + else + MOV(32, rd, Imm32((R15 & ~2) + offset)); +} + +}
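
For reference, the NZCV packing that Comp_RetriveFlags assembles above with SETcc/LEA/SHL (on the path where N and Z are needed) can be restated as plain host-side C++. This is only an illustrative sketch of the value that ends up OR'd into RCPSR; the function and parameter names are made up here and are not part of the diff.

#include <cstdint>

// Value merged into the top bits of RCPSR by the emitted SETcc/LEA/SHL sequence.
uint32_t packFlagsForCPSR(bool n, bool z, bool c, bool v,
                          bool retrieveCV, bool carryOnly)
{
    uint32_t nz = (uint32_t(n) << 1) | uint32_t(z);                  // LEA(..., SCALE_2)
    if (retrieveCV)
        return ((nz << 2) | (uint32_t(c) << 1) | uint32_t(v)) << 28; // N,Z,C,V -> bits 31..28
    if (carryOnly)
        return ((nz << 1) | uint32_t(c)) << 29;                      // N,Z,C   -> bits 31..29
    return nz << 30;                                                 // N,Z     -> bits 31..30
}
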
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..bda9e52 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -0,0 +1,272 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +template <typename T> +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + IrregularCycles = true; + + u32 newPC; + u32 cycles = 0; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + OR(32, R(RCPSR), Imm8(0x20)); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + AND(32, R(RCPSR), Imm32(~0x20)); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; + cpu9->RegionCodeCycles = regionCodeCycles; + + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + cpu9->CodeRead32(addr-2, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; + } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; + } + + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); +} + +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + IrregularCycles = true; + + bool cpsrDirty = CPSRDirty; + SaveCPSR(); + + PushRegs(restoreCPSR); + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + if (!restoreCPSR) + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + else + MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); + + PopRegs(restoreCPSR); + + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; +} + +void 
Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOV(32, MapReg(14), Imm32(R15 - 4)); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + OpArg rn = MapReg(CurInstr.A_Reg(0)); + MOV(32, R(RSCRATCH), rn); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOV(32, MapReg(14), Imm32(R15 - 4)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + Comp_SpecialBranchBehaviour(true); + + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + + Comp_SpecialBranchBehaviour(false); + + Comp_AddCycles_C(true); + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + + if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(32, R(RSCRATCH), MapReg(CurInstr.A_Reg(3))); + MOV(32, MapReg(14), Imm32(R15 - 1)); + Comp_JumpTo(RSCRATCH); + } + else + { + OpArg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn.GetSimpleReg()); + } +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOV(32, MapReg(14), Imm32(R15 + offset)); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + OpArg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + LEA(32, RSCRATCH, MDisp(lr.GetSimpleReg(), offset)); + MOV(32, lr, Imm32((R15 - 2) | 1)); + if (Num == 1 || CurInstr.Instr & (1 << 12)) + OR(32, R(RSCRATCH), Imm8(1)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BL_Merged() +{ + Comp_AddCycles_C(); + + R15 += 2; + + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) + target |= 1; + + MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); + + Comp_JumpTo(target); +} + +}
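
The target computation in A_Comp_BranchImm boils down to the following host-side arithmetic. This is a sketch under the assumption that r15 already holds PC + 8, as it does at compile time in this file; the names are illustrative and the code is not part of the diff.

#include <cstdint>

uint32_t armBranchTarget(uint32_t instr, uint32_t r15 /* PC + 8 */)
{
    int32_t offset = (int32_t)(instr << 8) >> 6;      // sign-extend imm24, times 4
    uint32_t target = r15 + offset;
    if (((instr >> 28) & 0xF) == 0xF)                 // cond == 0xF: BLX_imm
        target += (((instr >> 24) & 1) << 1) | 1;     // H bit as a halfword, plus Thumb bit
    return target;
}
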
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..d8bdd56 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -0,0 +1,899 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" +#include "../Config.h" + +#include <assert.h> + +#include "../dolphin/CommonFuncs.h" + +#ifdef _WIN32 +#include <windows.h> +#else +#include <sys/mman.h> +#include <unistd.h> +#endif + +using namespace Gen; + +extern "C" void ARM_Ret(); + +namespace ARMJIT +{ +template <> +const X64Reg RegisterCache<Compiler, X64Reg>::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13, R14, // callee saved + R10, R11, // caller saved +#else + RBX, R12, R13, R14, // callee saved, this is sad + R9, R10, R11, // caller saved +#endif +}; +template <> +const int RegisterCache<Compiler, X64Reg>::NativeRegsAvailable = +#ifdef _WIN32 + 8 +#else + 7 +#endif +; + +#ifdef _WIN32 +const BitSet32 CallerSavedPushRegs({R10, R11}); +#else +const BitSet32 CallerSavedPushRegs({R9, R10, R11}); +#endif + +void Compiler::PushRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + + if (saveHiRegs) + { + BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + { + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.UnloadRegister(reg); + else + SaveReg(reg, RegCache.Mapping[reg]); + // prevent saving the register twice + loadedRegs[reg] = false; + } + } + + for (int reg : loadedRegs) + if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + SaveReg(reg, RegCache.Mapping[reg]); +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + for (int reg : loadedRegs) + { + if ((saveHiRegs && reg >= 8 && reg < 15) + || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + { + LoadReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); + CALL(ReadBanked); + MOV(32, rd, R(RSCRATCH3)); + } + else + MOV(32, rd, R(RCPSR)); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + OpArg val = CurInstr.Instr & (1 << 25) + ? 
Imm32(ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))) + : MapReg(CurInstr.A_Reg(0)); + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); + CALL(ReadBanked); + + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + AND(32, R(RSCRATCH4), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH4), CC_E); + + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + NOT(32, R(RSCRATCH4)); + AND(32, R(RSCRATCH3), R(RSCRATCH4)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RSCRATCH3), R(RSCRATCH2)); + + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); + CALL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + AND(32, R(RCPSR), Imm32(~mask)); + if (!val.IsImm()) + { + MOV(32, R(RSCRATCH), val); + AND(32, R(RSCRATCH), Imm32(mask)); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + OR(32, R(RCPSR), Imm32(val.Imm32() & mask)); + } + } + else + { + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + AND(32, R(RSCRATCH3), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_E); + + MOV(32, R(RSCRATCH3), R(RCPSR)); + + // I need you ANDN + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(RCPSR), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RCPSR), R(RSCRATCH2)); + + PushRegs(true); + + MOV(32, R(ABI_PARAM3), R(RCPSR)); + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +/* + We'll repurpose this .bss memory + + */ +u8 CodeMemory[1024 * 1024 * 32]; + +Compiler::Compiler() +{ + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + ResetStart = pageAligned; + CodeMemSize = alignedSize; + } + + Reset(); + + { + // RSCRATCH mode + // RSCRATCH2 reg number + // RSCRATCH3 value in current mode + // ret - RSCRATCH3 + ReadBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ))); + RET(); + SetJumpTarget(irq); + 
MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ))); + RET(); + SetJumpTarget(svc); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC))); + RET(); + SetJumpTarget(abt); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT))); + RET(); + SetJumpTarget(und); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND))); + RET(); + } + { + // RSCRATCH mode + // RSCRATCH2 reg n + // RSCRATCH3 value + // carry flag set if the register isn't banked + WriteBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + STC(); + RET(); + + SetJumpTarget(fiq); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(irq); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(svc); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(abt); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)), R(RSCRATCH3)); + CLC(); + RET(); + SetJumpTarget(und); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)), R(RSCRATCH3)); + CLC(); + RET(); + } + + for (int consoleType = 0; consoleType < 2; consoleType++) + { + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 16; reg++) + { + if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2 || reg == ABI_PARAM3) + { + PatchedStoreFuncs[consoleType][num][size][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL; + continue; + } + + X64Reg rdMapped = (X64Reg)reg; + PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + MOV(32, R(ABI_PARAM3), R(rdMapped)); + } + else + { + MOV(32, R(ABI_PARAM2), R(rdMapped)); + } + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9<u32, 0>); break; + case 33: ABI_CallFunction(SlowWrite7<u32, 0>); break; + case 16: ABI_CallFunction(SlowWrite9<u16, 0>); break; + case 17: ABI_CallFunction(SlowWrite7<u16, 0>); break; + case 8: ABI_CallFunction(SlowWrite9<u8, 0>); break; + case 9: ABI_CallFunction(SlowWrite7<u8, 0>); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9<u32, 1>); break; + case 33: ABI_CallFunction(SlowWrite7<u32, 1>); break; + case 16: ABI_CallFunction(SlowWrite9<u16, 1>); break; + case 17: ABI_CallFunction(SlowWrite7<u16, 1>); break; + case 8: ABI_CallFunction(SlowWrite9<u8, 1>); break; + case 9: ABI_CallFunction(SlowWrite7<u8, 1>); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr(); + if 
(RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + MOV(64, R(ABI_PARAM2), R(RCPU)); + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9<u32, 0>); break; + case 33: ABI_CallFunction(SlowRead7<u32, 0>); break; + case 16: ABI_CallFunction(SlowRead9<u16, 0>); break; + case 17: ABI_CallFunction(SlowRead7<u16, 0>); break; + case 8: ABI_CallFunction(SlowRead9<u8, 0>); break; + case 9: ABI_CallFunction(SlowRead7<u8, 0>); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9<u32, 1>); break; + case 33: ABI_CallFunction(SlowRead7<u32, 1>); break; + case 16: ABI_CallFunction(SlowRead9<u16, 1>); break; + case 17: ABI_CallFunction(SlowRead7<u16, 1>); break; + case 8: ABI_CallFunction(SlowRead9<u8, 1>); break; + case 9: ABI_CallFunction(SlowRead7<u8, 1>); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (signextend) + MOVSX(32, 8 << size, rdMapped, R(RSCRATCH)); + else + MOVZX(32, 8 << size, rdMapped, R(RSCRATCH)); + RET(); + } + } + } + } + } + + // move the region forward to prevent overwriting the generated functions + CodeMemSize -= GetWritableCodePtr() - ResetStart; + ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; +} + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + + MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); +} + +void Compiler::SaveCPSR(bool flagClean) +{ + if (CPSRDirty) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); + if (flagClean) + CPSRDirty = false; + } +} + +void Compiler::LoadReg(int reg, X64Reg nativeReg) +{ + if (reg != 15) + MOV(32, R(nativeReg), MDisp(RCPU, offsetof(ARM, R[reg]))); + else + MOV(32, R(nativeReg), Imm32(R15)); +} + +void Compiler::SaveReg(int reg, X64Reg nativeReg) +{ + MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); +} + +// invalidates RSCRATCH and RSCRATCH3 +Gen::FixupBranch Compiler::CheckCondition(u32 cond) +{ + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + return J_CC(CC_Z, ldmStm); + } + else + { + // could have used a LUT, but then where would be the fun? + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); + + return J_CC(cond & 1 ? 
CC_NZ : CC_Z, ldmStm); + } +} + +#define F(x) &Compiler::x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // EOR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SUB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADD + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SBC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ORR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MOV + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // BIC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), 
F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MVN + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // TST + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // TEQ + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMP + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMN + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // Mul + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + F(A_Comp_CLZ), NULL, NULL, NULL, NULL, + // STR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSB + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // swap + NULL, NULL, + // LDM/STM + F(A_Comp_LDM_STM), F(A_Comp_LDM_STM), + // Branch + F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), + // system stuff + NULL, F(A_Comp_MSR), F(A_Comp_MSR), F(A_Comp_MRS), NULL, NULL, NULL, + F(Nop) +}; + +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + // Shift imm + F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), + // Three operand ADD/SUB + F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), + // 8 bit imm + F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), + // general ALU + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU), + // hi reg + F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), + // pc/sp relative + F(T_Comp_RelAddr), 
F(T_Comp_RelAddr), F(T_Comp_AddSP), + // LDR pcrel + F(T_Comp_LoadPCRel), + // LDR/STR reg offset + F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), + // LDR/STR sign extended, half + F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), + // LDR/STR imm offset + F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), + // LDR/STR half imm offset + F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf), + // LDR/STR sp rel + F(T_Comp_MemSPRel), F(T_Comp_MemSPRel), + // PUSH/POP + F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), + // LDMIA, STMIA + F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), + // Branch + F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), + // Unk, SVC + NULL, NULL, + F(T_Comp_BL_Merged) +}; +#undef F + +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + +void Compiler::Reset() +{ + memset(ResetStart, 0xcc, CodeMemSize); + SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; + + LoadStorePatches.clear(); +} + +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + +void Compiler::Comp_SpecialBranchBehaviour(bool taken) +{ + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) + { + RegCache.PrepareExit(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + JMP((u8*)&ARM_Ret, true); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +{ + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); + ResetBlockCache(); + } + + ConstantCycles = 0; + Thumb = thumb; + Num = cpu->Num; + CodeRegion = instrs[0].Addr >> 24; + CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = false; + + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); + + RegCache = RegisterCache<Compiler, X64Reg>(this, instrs, instrsCount); + + for (int i = 0; i < instrsCount; i++) + { + CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; + + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + + bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + { + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); + if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + + SaveCPSR(); + } + } + + if (comp != NULL) + RegCache.Prepare(Thumb, i); + else + RegCache.Flush(); + + if (Thumb) + { + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + } + else + { + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + if (comp) + (this->*comp)(); + else + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } + } + else if (cond == 0xF) + { + Comp_AddCycles_C(); + } + else + { + IrregularCycles = false; + + FixupBranch skipExecute; + if (cond < 0xE) + skipExecute = CheckCondition(cond); + + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); + } + else + (this->*comp)(); + + Comp_SpecialBranchBehaviour(true); + + if (CurInstr.Cond() < 0xE) + { + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) + { + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + + Comp_AddCycles_C(true); + + Comp_SpecialBranchBehaviour(false); + + SetJumpTarget(skipFailed); + } + else + SetJumpTarget(skipExecute); + } + + } + } + + if (comp == NULL) + LoadCPSR(); + } + + RegCache.Flush(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + JMP((u8*)ARM_Ret, true); + + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + + return res; +} + +void Compiler::Comp_AddCycles_C(bool forceNonConstant) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +void Compiler::Comp_AddCycles_CI(u32 i) +{ + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; + + if (!Thumb && CurInstr.Cond() < 0xE) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + { + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + } + else + { + ConstantCycles += cycles; + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + } +} + +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if ((CurInstr.DataRegion >> 4) == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +}
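
The main-RAM cycle merging used by Comp_AddCycles_CDI on the ARM7 side is compact enough to restate as a standalone function. The sketch below mirrors only that branch of the code above; the function and parameter names are assumptions, not identifiers from the diff.

#include <algorithm>

// Overlapping code/data accesses to main RAM are capped instead of summed.
int mergeCodeDataCycles7(int numC, int numD, bool codeInMainRAM, bool dataInMainRAM)
{
    if (dataInMainRAM)
    {
        if (codeInMainRAM)
            return numC + numD;
        numC++;
        return std::max(numC + numD - 3, std::max(numC, numD));
    }
    if (codeInMainRAM)
    {
        numD++;
        return std::max(numC + numD - 3, std::max(numC, numD));
    }
    return numC + numD + 1;
}
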
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h new file mode 100644 index 0000000..0fe0147 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -0,0 +1,255 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../dolphin/x64Emitter.h" + +#include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" +#include "../ARMJIT_RegisterCache.h" + +#include <unordered_map> + +namespace ARMJIT +{ + +const Gen::X64Reg RCPU = Gen::RBP; +const Gen::X64Reg RCPSR = Gen::R15; + +const Gen::X64Reg RSCRATCH = Gen::EAX; +const Gen::X64Reg RSCRATCH2 = Gen::EDX; +const Gen::X64Reg RSCRATCH3 = Gen::ECX; +const Gen::X64Reg RSCRATCH4 = Gen::R8; + +struct LoadStorePatch +{ + void* PatchFunc; + s16 Offset; + u16 Size; +}; + +struct Op2 +{ + Op2() + {} + + Op2(u32 imm) + : IsImm(true), Imm(imm) + {} + Op2(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; + +class Compiler : public Gen::XEmitter +{ +public: + Compiler(); + + void Reset(); + + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + + void LoadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); + + bool CanCompile(bool thumb, u16 kind); + + typedef void (Compiler::*CompileFunc)(); + + void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); + + void Comp_AddCycles_C(bool forceNonConstant = false); + void Comp_AddCycles_CI(u32 i); + void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); + + enum + { + opSetsFlags = 1 << 0, + opSymmetric = 1 << 1, + opRetriveCV = 1 << 2, + opInvertCarry = 1 << 3, + opSyncCarry = 1 << 4, + opInvertOp2 = 1 << 5, + }; + + void Nop() {} + + void A_Comp_Arith(); + void A_Comp_MovOp(); + void A_Comp_CmpOp(); + + void A_Comp_MUL_MLA(); + void A_Comp_Mul_Long(); + + void A_Comp_CLZ(); + + void A_Comp_MemWB(); + void A_Comp_MemHalf(); + void A_Comp_LDM_STM(); + + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + + void A_Comp_MRS(); + void A_Comp_MSR(); + + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALU_Imm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); + void T_Comp_MUL(); + + void T_Comp_RelAddr(); + void T_Comp_AddSP(); + + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + void T_Comp_PUSH_POP(); + void T_Comp_LDMIA_STMIA(); + + void T_Comp_BCOND(); + void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(); + + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags); + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, 
Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + + void Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn); + + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + + void Comp_SpecialBranchBehaviour(bool taken); + + + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); + Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); + + Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + + void LoadCPSR(); + void SaveCPSR(bool flagClean = true); + + bool FlagsNZRequired() + { return CurInstr.SetFlags & 0xC; } + + Gen::FixupBranch CheckCondition(u32 cond); + + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + + Gen::OpArg MapReg(int reg) + { + if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) + return Gen::Imm32(R15); + + assert(RegCache.Mapping[reg] != Gen::INVALID_REG); + return Gen::R(RegCache.Mapping[reg]); + } + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + + void* PatchedStoreFuncs[2][2][3][16]; + void* PatchedLoadFuncs[2][2][3][2][16]; + + std::unordered_map<u8*, LoadStorePatch> LoadStorePatches; + + u8* ResetStart; + u32 CodeMemSize; + + bool Exit; + bool IrregularCycles; + + void* ReadBanked; + void* WriteBanked; + + bool CPSRDirty = false; + + FetchedInstr CurInstr; + + RegisterCache<Compiler, Gen::X64Reg> RegCache; + + bool Thumb; + u32 Num; + u32 R15; + u32 CodeRegion; + + u32 ConstantCycles; + + ARM* CurCPU; +}; + +} + +#endif
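
The Op2 helper declared in this header carries either a rotated immediate or a (register, shift op, amount) triple, and Comp_MemAccess consumes it together with the memop_* flags. The call sites below are hypothetical, shown only to illustrate the interface; they do not appear in the diff and assume a fully initialised Compiler.

#include "ARMJIT_Compiler.h"

using namespace ARMJIT;

void exampleOperands(Compiler& c)
{
    Op2 immOffset(0x10);        // e.g. LDR r0, [r1, #0x10]
    Op2 regOffset(2, 0, 3);     // e.g. LDR r0, [r1, r2, LSL #3] (reg 2, op 0 = LSL, amount 3)

    // flags == 0 selects a pre-indexed load with an added offset, no writeback:
    c.Comp_MemAccess(0, 1, immOffset, 32, 0);
    c.Comp_MemAccess(0, 1, regOffset, 32, 0);
}
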
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +}
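
For clarity, the writeOffset macro above expands to one fprintf per field, so the generated ARMJIT_Offsets.h is just a list of #define lines that the hand-written dispatcher below includes. The expansion for CPSR looks like this; the emitted hex value is whatever offsetof reports for the build, so it is not shown here.

// Expansion of writeOffset(CPSR):
fprintf(f, "#define ARM_" "CPSR" "_offset 0x%x\n", offsetof(ARM, CPSR));
// Generated line (offset value depends on the ARM struct layout of the build):
//   #define ARM_CPSR_offset 0x<offsetof(ARM, CPSR)>
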
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..0a84df0 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,78 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx +#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#else + sub rsp, 0x8 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#else + add rsp, 0x8 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..2da113b --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,773 @@ +#include "ARMJIT_Compiler.h" + +#include "../Config.h" + +using namespace Gen; + +namespace ARMJIT +{ + +template <typename T> +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +s32 Compiler::RewriteMemAccess(u64 pc) +{ + auto it = LoadStorePatches.find((u8*)pc); + if (it != LoadStorePatches.end()) + { + LoadStorePatch patch = it->second; + LoadStorePatches.erase(it); + + u8* curCodePtr = GetWritableCodePtr(); + u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset; + SetCodePtr(rewritePtr); + + CALL(patch.PatchFunc); + u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr); + if (remainingSize > 0) + NOP(remainingSize); + + //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size); + + SetCodePtr(curCodePtr); + + return patch.Offset; + } + + printf("this is a JIT bug %x\n", pc); + abort(); +} + +/* + According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) + of all memory load and store instructions always access addresses in the same region as + during the their first execution. + + I tried multiple optimisations, which would benefit from this behaviour + (having fast paths for the first region, …), though none of them yielded a measureable + improvement. 
+*/ + +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) +{ + u32 localAddr = LocaliseCodeAddress(Num, addr); + + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) + { + InvalidLiterals.Remove(invalidLiteralIdx); + return false; + } + + Comp_AddCycles_CDI(); + + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + { + CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } + else + { + CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } + CurCPU->R[15] = tmpR15; + + MOV(32, MapReg(rd), Imm32(val)); + + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + return true; +} + + +void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags) +{ + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } + + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } + + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + OpArg rdMapped = MapReg(rd); + + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) + { + MOV(32, R(RSCRATCH3), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + u8* memopStart = GetWritableCodePtr(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? 
PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] + : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; + + assert(patch.PatchFunc != NULL); + + MOV(64, R(RSCRATCH), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + + X64Reg maskedAddr = RSCRATCH3; + if (size > 8) + { + maskedAddr = RSCRATCH2; + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + AND(32, R(RSCRATCH2), Imm8(addressMask)); + } + + u8* memopLoadStoreLocation = GetWritableCodePtr(); + if (flags & memop_Store) + { + MOV(size, MRegSum(RSCRATCH, maskedAddr), rdMapped); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + + if (size == 32) + { + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); + } + } + + patch.Offset = memopStart - memopLoadStoreLocation; + patch.Size = GetWritableCodePtr() - memopStart; + + assert(patch.Size >= 5); + + LoadStorePatches[memopLoadStoreLocation] = patch; + } + else + { + PushRegs(false); + + if (Num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowWrite9<u32, 0>); break; + case 16: CALL((void*)&SlowWrite9<u16, 0>); break; + case 8: CALL((void*)&SlowWrite9<u8, 0>); break; + case 33: CALL((void*)&SlowWrite9<u32, 1>); break; + case 17: CALL((void*)&SlowWrite9<u16, 1>); break; + case 9: CALL((void*)&SlowWrite9<u8, 1>); break; + } + } + else + { + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowRead9<u32, 0>); break; + case 16: CALL((void*)&SlowRead9<u16, 0>); break; + case 8: CALL((void*)&SlowRead9<u8, 0>); break; + case 33: CALL((void*)&SlowRead9<u32, 1>); break; + case 17: CALL((void*)&SlowRead9<u16, 1>); break; + case 9: CALL((void*)&SlowRead9<u8, 1>); break; + } + } + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); + + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowWrite7<u32, 0>); break; + case 16: CALL((void*)&SlowWrite7<u16, 0>); break; + case 8: CALL((void*)&SlowWrite7<u8, 0>); break; + case 33: CALL((void*)&SlowWrite7<u32, 1>); break; + case 17: CALL((void*)&SlowWrite7<u16, 1>); break; + case 9: CALL((void*)&SlowWrite7<u8, 1>); break; + } + } + else + { + switch (size | NDS::ConsoleType) + { + case 32: CALL((void*)&SlowRead7<u32, 0>); break; + case 16: CALL((void*)&SlowRead7<u16, 0>); break; + case 8: CALL((void*)&SlowRead7<u8, 0>); break; + case 33: CALL((void*)&SlowRead7<u32, 1>); break; + case 17: CALL((void*)&SlowRead7<u16, 1>); break; + case 9: CALL((void*)&SlowRead7<u8, 1>); break; + } + } + } + + PopRegs(false); + + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! 
LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + { + if (Thumb) + OR(32, rdMapped, Imm8(0x1)); + else + AND(32, rdMapped, Imm8(0xFE)); + } + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + int regsCount = regs.Count(); + + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) + { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement && preinc) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); + + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + if (!store) + Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); + + bool compileFastPath = Config::JIT_FastMemory + && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = (((regsCount + 4 + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#else + u32 stackAlloc = (((regsCount + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; + + if (decrement) + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4 + (preinc ? 0 : 4))); + else + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(preinc ? 4 : 0)); + + if (compileFastPath) + { + AND(32, R(RSCRATCH4), Imm8(~3)); + + u8* fastPathStart = GetWritableCodePtr(); + u8* firstLoadStoreAddr; + + bool firstLoadStore = true; + + MOV(64, R(RSCRATCH2), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + ADD(64, R(RSCRATCH2), R(RSCRATCH4)); + MOV(32, R(RSCRATCH3), R(RSCRATCH4)); + + u32 offset = 0; + for (int reg : regs) + { + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + + OpArg mem = MDisp(RSCRATCH2, offset); + if (store) + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, mem, MapReg(reg)); + } + else + { + LoadReg(reg, RSCRATCH); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); + } + else + { + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); + } + } + offset += 4; + + firstLoadStore = false; + } + + LoadStorePatch patch; + patch.Size = GetWritableCodePtr() - fastPathStart; + patch.Offset = fastPathStart - firstLoadStoreAddr; + SwitchToFarCode(); + patch.PatchFunc = GetWritableCodePtr(); + + LoadStorePatches[firstLoadStoreAddr] = patch; + } + + if (!store) + { + PushRegs(false); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + if (allocOffset == 0) + MOV(64, R(ABI_PARAM2), R(RSP)); + else + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); + + switch (Num * 2 | NDS::ConsoleType) + { + case 0: CALL((void*)&SlowBlockTransfer9<false, 0>); break; + case 1: CALL((void*)&SlowBlockTransfer9<false, 1>); break; + case 2: CALL((void*)&SlowBlockTransfer7<false, 0>); break; + case 3: CALL((void*)&SlowBlockTransfer7<false, 1>); break; + } + + PopRegs(false); + + if (allocOffset) + ADD(64, R(RSP), Imm8(allocOffset)); + + bool firstUserMode = true; + for (int reg : regs) + { + if (usermode && !regs[15] && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + POP(RSCRATCH3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); + else + SaveReg(reg, RSCRATCH3); + SetJumpTarget(sucessfulWritten); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); + } + else + { + POP(MapReg(reg).GetSimpleReg()); + } + } + } + else + { + bool firstUserMode = true; + for (int reg = 15; reg >= 0; reg--) + { + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, RSCRATCH3); + else + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(RSCRATCH3); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + { + PUSH(MapReg(reg).GetSimpleReg()); + } + } + } + + if (allocOffset) + SUB(64, R(RSP), Imm8(allocOffset)); + + PushRegs(false); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + if (allocOffset) + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + else + MOV(64, R(ABI_PARAM2), R(RSP)); + + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); + + switch (Num * 2 | NDS::ConsoleType) + { + case 0: CALL((void*)&SlowBlockTransfer9<true, 0>); break; + case 1: CALL((void*)&SlowBlockTransfer9<true, 1>); break; + case 2: CALL((void*)&SlowBlockTransfer7<true, 0>); break; + case 3: CALL((void*)&SlowBlockTransfer7<true, 1>); break; + } + + ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); + + PopRegs(false); + } + + if (compileFastPath) + { + RET(); + SwitchToNearCode(); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + + return offset; +} + + +void Compiler::A_Comp_MemWB() +{ + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + + Op2 offset; + if (!(CurInstr.Instr & (1 << 25))) + { + offset = Op2(CurInstr.Instr & 0xFFF); + } + else + { + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); + + offset = Op2(rm, op, amount); + } + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::A_Comp_MemHalf() +{ + Op2 offset = CurInstr.Instr & (1 << 22) + ? Op2(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : Op2(CurInstr.A_Reg(0), 0, 0); + + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); + + bool signExtend = false; + int size; + if (!load) + { + size = op == 1 ? 16 : 32; + load = op == 2; + } + else if (load) + { + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } + + if (size == 32 && Num == 1) + return; // NOP + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); +} + +void Compiler::T_Comp_MemReg() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + bool byte = op & 0x1; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); + + OpArg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; + if (writeback) + ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset)); +} + +void Compiler::T_Comp_MemImm() +{ + int op = (CurInstr.Instr >> 11) & 0x3; + bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), + byte ? 8 : 32, load ? 0 : memop_Store); +} + +void Compiler::T_Comp_MemRegHalf() +{ + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), + size, flags); +} + +void Compiler::T_Comp_MemImmHalf() +{ + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, + load ? 
0 : memop_Store); +} + +void Compiler::T_Comp_LoadPCRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + bool load = CurInstr.Instr & (1 << 11); + + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, + load ? 0 : memop_Store); +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + OpArg sp = MapReg(13); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); + + ADD(32, sp, Imm8(offset)); // offset will always be in range since PUSH accesses 9 regs max +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + OpArg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + ADD(32, rb, Imm8(offset)); +} + +}
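The slow-path helpers above are picked with a small dispatch trick: the access size (8, 16 or 32) and NDS::ConsoleType (0 for DS, 1 for DSi) are OR'd into one switch key, so case 33 means a 32-bit DSi access and case 9 an 8-bit DSi access. A minimal stand-alone sketch of that idea, not part of the patch (the function name is made up for illustration):

// Sketch: mirrors the `size | ConsoleType` switch used by the JIT's slow memory path.
// size is always 8, 16 or 32 and consoleType is 0 or 1, so the two never collide in the low bits.
static const char* DescribeAccess(int size, int consoleType)
{
    switch (size | consoleType)
    {
    case 32: return "32-bit DS access";
    case 33: return "32-bit DSi access";
    case 16: return "16-bit DS access";
    case 17: return "16-bit DSi access";
    case 8:  return "8-bit DS access";
    case 9:  return "8-bit DSi access";
    default: return "invalid";
    }
}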
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp new file mode 100644 index 0000000..ccec951 --- /dev/null +++ b/src/ARM_InstrInfo.cpp @@ -0,0 +1,539 @@ +#include "ARM_InstrInfo.h" + +#include <stdio.h> + +#include "Config.h" + +namespace ARMInstrInfo +{ + +#define ak(x) ((x) << 22) + +enum { + A_Read0 = 1 << 0, + A_Read16 = 1 << 1, + A_Read8 = 1 << 2, + A_Read12 = 1 << 3, + + A_Write12 = 1 << 4, + A_Write16 = 1 << 5, + A_MemWriteback = 1 << 6, + + A_BranchAlways = 1 << 7, + + // for STRD/LDRD + A_Read12Double = 1 << 8, + A_Write12Double = 1 << 9, + + A_Link = 1 << 10, + + A_UnkOnARM7 = 1 << 11, + + A_SetNZ = 1 << 12, + A_SetCV = 1 << 13, + A_SetMaybeC = 1 << 14, + A_MulFlags = 1 << 15, + A_ReadC = 1 << 16, + A_RRXReadC = 1 << 17, + A_StaticShiftSetC = 1 << 18, + A_SetC = 1 << 19, + + A_WriteMem = 1 << 20, + A_LoadMem = 1 << 21 +}; + +#define A_BIOP A_Read16 +#define A_MONOOP 0 + +#define A_ARITH_LSL_IMM A_SetCV +#define A_LOGIC_LSL_IMM A_StaticShiftSetC +#define A_ARITH_SHIFT_IMM A_SetCV +#define A_LOGIC_SHIFT_IMM A_SetC +#define A_ARITH_SHIFT_REG A_SetCV +#define A_LOGIC_SHIFT_REG A_SetMaybeC +#define A_ARITH_IMM A_SetCV +#define A_LOGIC_IMM 0 + +#define A_IMPLEMENT_ALU_OP(x,k,a,c) \ + const u32 A_##x##_IMM = A_Write12 | c | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ + \ + const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a##_LSL_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | 
A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(SUB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(RSB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADD,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(SBC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(RSC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(ORR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MOV,MONOOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(BIC,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) + +const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; + +#define A_IMPLEMENT_ALU_TEST(x,a) \ + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a##_IMM | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a##_LSL_IMM | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST,LOGIC) +A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) +A_IMPLEMENT_ALU_TEST(CMP,ARITH) +A_IMPLEMENT_ALU_TEST(CMN,ARITH) + +const u32 A_MUL = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); +const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); +const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); +const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); +const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); + +const u32 A_CLZ = A_Write12 | A_Read0 | A_UnkOnARM7 | ak(ak_CLZ); + +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); + +#define A_LDR A_Write12 | A_LoadMem +#define A_STR A_Read12 | A_WriteMem + +#define A_IMPLEMENT_WB_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ + const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | 
A_Read0 | ak(ak_##x##_REG_LSR); \ + const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ + const u32 A_##x##_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ + const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ + const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ + const u32 A_##x##_POST_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + +A_IMPLEMENT_WB_LDRSTR(STR,STR) +A_IMPLEMENT_WB_LDRSTR(STRB,STR) +A_IMPLEMENT_WB_LDRSTR(LDR,LDR) +A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) + +#define A_LDRD A_Write12Double | A_LoadMem +#define A_STRD A_Read12Double | A_WriteMem + +#define A_IMPLEMENT_HD_LDRSTR(x,k) \ + const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG); \ + const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ + const u32 A_##x##_POST_REG = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG); + +A_IMPLEMENT_HD_LDRSTR(STRH,STR) +A_IMPLEMENT_HD_LDRSTR(LDRD,LDRD) +A_IMPLEMENT_HD_LDRSTR(STRD,STRD) +A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) +A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) + +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWPB); + +const u32 A_LDM = A_Read16 | A_MemWriteback | A_LoadMem | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); + +const u32 A_B = A_BranchAlways | ak(ak_B); +const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); +const u32 A_BLX_IMM = A_BranchAlways | A_Link | ak(ak_BLX_IMM); +const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); +const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); + +const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); +const u32 A_MSR_IMM = ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | ak(ak_MRC); +const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); + +// THUMB + +#define tk(x) ((x) << 22) + +enum { + T_Read0 = 1 << 0, + T_Read3 = 1 << 1, + T_Read6 = 1 << 2, + T_Read8 = 1 << 3, + + T_Write0 = 1 << 4, + T_Write8 = 1 << 5, + + T_ReadHi0 = 1 << 6, + T_ReadHi3 = 1 << 7, + T_WriteHi0 = 1 << 8, + + T_ReadR13 = 1 << 9, + T_WriteR13 = 1 << 10, + + T_BranchAlways = 1 << 12, + T_ReadR14 = 1 << 13, + T_WriteR14 = 1 << 14, + + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, + + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, +}; + +const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | 
T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Read8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_SetNZ | T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_SetNZ | T_Write0 | T_Read3 | tk(tk_MVN_REG); + +const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); +const u32 T_CMP_HIREG = T_SetNZ | T_SetCV | T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); + +const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); +const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); +const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); + +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); + +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); + +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); + +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); + +const u32 T_PUSH = 
T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); + +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); + +const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); +const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); +const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_B = T_BranchAlways | tk(tk_B); +const u32 T_BL_LONG_1 = T_WriteR14 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | tk(tk_BL_LONG_2); + +const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); + +#define INSTRFUNC_PROTO(x) u32 x +#include "ARM_InstrTable.h" +#undef INSTRFUNC_PROTO + +Info Decode(bool thumb, u32 num, u32 instr) +{ + const u8 FlagsReadPerCond[7] = { + flag_Z, + flag_C, + flag_N, + flag_V, + flag_C | flag_Z, + flag_N | flag_V, + flag_Z | flag_N | flag_V}; + + Info res = {0}; + if (thumb) + { + u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + res.Kind = (data >> 22) & 0x3F; + + if (data & T_Read0) + res.SrcRegs |= 1 << (instr & 0x7); + if (data & T_Read3) + res.SrcRegs |= 1 << ((instr >> 3) & 0x7); + if (data & T_Read6) + res.SrcRegs |= 1 << ((instr >> 6) & 0x7); + if (data & T_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_Write0) + res.DstRegs |= 1 << (instr & 0x7); + if (data & T_Write8) + res.DstRegs |= 1 << ((instr >> 8) & 0x7); + + if (data & T_ReadHi0) + res.SrcRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + if (data & T_ReadHi3) + res.SrcRegs |= 1 << ((instr >> 3) & 0xF); + if (data & T_WriteHi0) + res.DstRegs |= 1 << ((instr & 0x7) | ((instr >> 4) & 0x8)); + + if (data & T_ReadR13) + res.SrcRegs |= (1 << 13); + if (data & T_WriteR13) + res.DstRegs |= (1 << 13); + if (data & T_WriteR14) + res.DstRegs |= (1 << 14); + if (data & T_ReadR14) + res.SrcRegs |= (1 << 14); + + if (data & T_BranchAlways) + res.DstRegs |= (1 << 15); + + if (res.Kind == tk_POP && instr & (1 << 8)) + res.DstRegs |= 1 << 15; + + if (data & T_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & T_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & T_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if (data & T_ReadC) + res.ReadFlags |= flag_C; + if (data & T_SetC) + res.WriteFlags |= flag_C; + + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + + if (data & T_LoadMem) + { + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } + } + + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); + res.SrcRegs |= set; + } + + res.EndBlock |= res.Branches(); + + if (res.Kind == tk_BCOND) + res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; + + return res; + } + else + { + u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; + if (num == 0 && (instr & 0xFE000000) == 0xFA000000) + data = A_BLX_IMM; + else if ((instr >> 28) == 0xF) + data = ak(ak_Nop); + + if (data & A_UnkOnARM7 && num == 1) + data = A_UNK; + + res.Kind = 
(data >> 22) & 0x1FF; + + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + + if (res.Kind == ak_MCR) + { + u32 cn = (instr >> 16) & 0xF; + u32 cm = instr & 0xF; + u32 cpinfo = (instr >> 5) & 0x7; + u32 id = (cn<<8)|(cm<<4)|cpinfo; + if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) + res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; + } + if (res.Kind == ak_MCR || res.Kind == ak_MRC) + { + u32 cp = ((instr >> 8) & 0xF); + if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) + { + data = A_UNK; + res.Kind = ak_UNK; + } + } + if (res.Kind == ak_MRS && !(instr & (1 << 22))) + res.ReadFlags |= flag_N | flag_Z | flag_C | flag_V; + if ((res.Kind == ak_MSR_IMM || res.Kind == ak_MSR_REG) && instr & (1 << 19)) + res.WriteFlags |= flag_N | flag_Z | flag_C | flag_V; + + if (data & A_Read0) + res.SrcRegs |= 1 << (instr & 0xF); + if (data & A_Read16) + res.SrcRegs |= 1 << ((instr >> 16) & 0xF); + if (data & A_Read8) + res.SrcRegs |= 1 << ((instr >> 8) & 0xF); + if (data & A_Read12) + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + + if (data & A_Write12) + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + if (data & A_Write16) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_MemWriteback && instr & (1 << 21)) + res.DstRegs |= 1 << ((instr >> 16) & 0xF); + + if (data & A_BranchAlways) + res.DstRegs |= 1 << 15; + + if (data & A_Read12Double) + { + res.SrcRegs |= 1 << ((instr >> 12) & 0xF); + res.SrcRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + if (data & A_Write12Double) + { + res.DstRegs |= 1 << ((instr >> 12) & 0xF); + res.DstRegs |= 1 << (((instr >> 12) & 0xF) + 1); + } + + if (data & A_Link) + res.DstRegs |= 1 << 14; + + if (res.Kind == ak_LDM) + res.DstRegs |= instr & (1 << 15); // this is right + + if (res.Kind == ak_STM) + res.SrcRegs |= instr & (1 << 15); + + if (data & A_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & A_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if ((data & A_MulFlags) && (instr & (1 << 20))) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_ReadC) + res.ReadFlags |= flag_C; + if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) + res.ReadFlags |= flag_C; + if ((data & A_SetC) || ((data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F))) + res.WriteFlags |= flag_C; + + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + } + + if ((instr >> 28) < 0xE) + { + // make non conditional flag sets conditional + res.WriteFlags = (res.WriteFlags | (res.WriteFlags << 4)) & 0xF0; + res.ReadFlags |= FlagsReadPerCond[instr >> 29]; + } + + res.EndBlock |= res.Branches(); + + return res; + } +} + +} diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h new file mode 100644 index 0000000..a702435 --- /dev/null +++ b/src/ARM_InstrInfo.h @@ -0,0 +1,263 @@ +#ifndef ARMINSTRINFO_H +#define ARMINSTRINFO_H + +#include "types.h" + +namespace ARMInstrInfo +{ + +// Instruction kinds, for faster dispatch + +#define ak_ALU(n) 
\ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_REG_LSL_IMM_S, \ + ak_##n##_REG_LSR_IMM_S, \ + ak_##n##_REG_ASR_IMM_S, \ + ak_##n##_REG_ROR_IMM_S, \ + \ + ak_##n##_REG_LSL_REG_S, \ + ak_##n##_REG_LSR_REG_S, \ + ak_##n##_REG_ASR_REG_S, \ + ak_##n##_REG_ROR_REG_S, \ + \ + ak_##n##_IMM_S \ + +#define ak_Test(n) \ + ak_##n##_REG_LSL_IMM, \ + ak_##n##_REG_LSR_IMM, \ + ak_##n##_REG_ASR_IMM, \ + ak_##n##_REG_ROR_IMM, \ + \ + ak_##n##_REG_LSL_REG, \ + ak_##n##_REG_LSR_REG, \ + ak_##n##_REG_ASR_REG, \ + ak_##n##_REG_ROR_REG, \ + \ + ak_##n##_IMM + +#define ak_WB_LDRSTR(n) \ + ak_##n##_REG_LSL, \ + ak_##n##_REG_LSR, \ + ak_##n##_REG_ASR, \ + ak_##n##_REG_ROR, \ + \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG_LSL, \ + ak_##n##_POST_REG_LSR, \ + ak_##n##_POST_REG_ASR, \ + ak_##n##_POST_REG_ROR, \ + \ + ak_##n##_POST_IMM + +#define ak_HD_LDRSTR(n) \ + ak_##n##_REG, \ + ak_##n##_IMM, \ + \ + ak_##n##_POST_REG, \ + ak_##n##_POST_IMM + +enum +{ + ak_ALU(AND), + ak_ALU(EOR), + ak_ALU(SUB), + ak_ALU(RSB), + ak_ALU(ADD), + ak_ALU(ADC), + ak_ALU(SBC), + ak_ALU(RSC), + ak_ALU(ORR), + ak_ALU(MOV), + ak_ALU(BIC), + ak_ALU(MVN), + + ak_Test(TST), + ak_Test(TEQ), + ak_Test(CMP), + ak_Test(CMN), + + ak_MUL, + ak_MLA, + ak_UMULL, + ak_UMLAL, + ak_SMULL, + ak_SMLAL, + ak_SMLAxy, + ak_SMLAWy, + ak_SMULWy, + ak_SMLALxy, + ak_SMULxy, + + ak_CLZ, + + ak_QADD, + ak_QSUB, + ak_QDADD, + ak_QDSUB, + + ak_WB_LDRSTR(STR), + ak_WB_LDRSTR(STRB), + ak_WB_LDRSTR(LDR), + ak_WB_LDRSTR(LDRB), + + ak_HD_LDRSTR(STRH), + ak_HD_LDRSTR(LDRD), + ak_HD_LDRSTR(STRD), + ak_HD_LDRSTR(LDRH), + ak_HD_LDRSTR(LDRSB), + ak_HD_LDRSTR(LDRSH), + + ak_SWP, + ak_SWPB, + + ak_LDM, + ak_STM, + + ak_B, + ak_BL, + ak_BLX_IMM, + ak_BX, + ak_BLX_REG, + + ak_UNK, + ak_MSR_IMM, + ak_MSR_REG, + ak_MRS, + ak_MCR, + ak_MRC, + ak_SVC, + + ak_Nop, + + ak_Count, + + tk_LSL_IMM = 0, + tk_LSR_IMM, + tk_ASR_IMM, + + tk_ADD_REG_, + tk_SUB_REG_, + tk_ADD_IMM_, + tk_SUB_IMM_, + + tk_MOV_IMM, + tk_CMP_IMM, + tk_ADD_IMM, + tk_SUB_IMM, + + tk_AND_REG, + tk_EOR_REG, + tk_LSL_REG, + tk_LSR_REG, + tk_ASR_REG, + tk_ADC_REG, + tk_SBC_REG, + tk_ROR_REG, + tk_TST_REG, + tk_NEG_REG, + tk_CMP_REG, + tk_CMN_REG, + tk_ORR_REG, + tk_MUL_REG, + tk_BIC_REG, + tk_MVN_REG, + + tk_ADD_HIREG, + tk_CMP_HIREG, + tk_MOV_HIREG, + + tk_ADD_PCREL, + tk_ADD_SPREL, + tk_ADD_SP, + + tk_LDR_PCREL, + tk_STR_REG, + tk_STRB_REG, + tk_LDR_REG, + tk_LDRB_REG, + tk_STRH_REG, + tk_LDRSB_REG, + tk_LDRH_REG, + tk_LDRSH_REG, + tk_STR_IMM, + tk_LDR_IMM, + tk_STRB_IMM, + tk_LDRB_IMM, + tk_STRH_IMM, + tk_LDRH_IMM, + tk_STR_SPREL, + tk_LDR_SPREL, + + tk_PUSH, + tk_POP, + tk_LDMIA, + tk_STMIA, + + tk_BCOND, + tk_BX, + tk_BLX_REG, + tk_B, + tk_BL_LONG_1, + tk_BL_LONG_2, + tk_UNK, + tk_SVC, + + // not a real instruction + tk_BL_LONG, + + tk_Count +}; + +enum +{ + flag_N = 1 << 3, + flag_Z = 1 << 2, + flag_C = 1 << 1, + flag_V = 1 << 0, +}; + +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_LoadMem, + special_WaitForInterrupt, + special_LoadLiteral +}; + +struct Info +{ + u16 DstRegs, SrcRegs, NotStrictlyNeeded; + u16 Kind; + + u8 SpecialKind; + + u8 ReadFlags; + // lower 4 bits - set always + // upper 4 bits - might set flag + u8 WriteFlags; + + bool EndBlock; + bool Branches() const + { + return DstRegs & (1 << 15); + } +}; + +Info Decode(bool thumb, u32 num, u32 instr); + +} + +#endif
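The Info record above is what the block compiler consumes for register-liveness and flag analysis. As a rough usage sketch (hypothetical caller, not code from this commit, and assuming the instruction table maps the encoding below to the plain MOV register kind): decoding the ARM instruction MOV R0, R1 (0xE1A00001) on the ARM9 should report r1 as a source, r0 as a destination and no branch, so the block can keep going.

#include "ARM_InstrInfo.h"

// Hypothetical caller, for illustration only.
bool DecodeExample()
{
    // thumb = false, num = 0 (ARM9), instr = 0xE1A00001 (MOV R0, R1)
    ARMInstrInfo::Info info = ARMInstrInfo::Decode(false, 0, 0xE1A00001);

    bool readsR1  = info.SrcRegs & (1 << 1);   // expected: true
    bool writesR0 = info.DstRegs & (1 << 0);   // expected: true
    bool branches = info.Branches();           // expected: false, PC is not written
    return readsR1 && writesR0 && !branches;
}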
\ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 32fcac2..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,7 @@ project(core) +set (CMAKE_CXX_STANDARD 14) + add_library(core STATIC ARCodeList.cpp AREngine.cpp @@ -44,10 +46,53 @@ add_library(core STATIC version.h Wifi.cpp WifiAP.cpp - + tiny-AES-c/aes.c + xxhash/xxhash.c ) +if (ENABLE_JIT) + enable_language(ASM) + + target_sources(core PRIVATE + ARM_InstrInfo.cpp + + ARMJIT.cpp + ARMJIT_Memory.cpp + + dolphin/CommonFuncs.cpp + ) + + if (ARCHITECTURE STREQUAL x86_64) + target_sources(core PRIVATE + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s + ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") + endif() + if (ARCHITECTURE STREQUAL ARM64) + target_sources(core PRIVATE + dolphin/Arm64Emitter.cpp + dolphin/MathUtil.cpp + + ARMJIT_A64/ARMJIT_Compiler.cpp + ARMJIT_A64/ARMJIT_ALU.cpp + ARMJIT_A64/ARMJIT_LoadStore.cpp + ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s + ) + endif() +endif() + if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) else() diff --git a/src/CP15.cpp b/src/CP15.cpp index d340b9e..992c83f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -21,6 +21,8 @@ #include "NDS.h" #include "DSi.h" #include "ARM.h" +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -41,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -74,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -97,18 +99,26 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) + { + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; + } } void ARMv5::UpdateITCMSetting() @@ -562,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. 
%08X\n", val); + //Halt(255); return; @@ -595,6 +608,27 @@ void ARMv5::CP15Write(u32 id, u32 val) ITCMSetting = val; UpdateITCMSetting(); return; + + case 0xF00: + //printf("cache debug index register %08X\n", val); + return; + + case 0xF10: + //printf("cache debug instruction tag %08X\n", val); + return; + + case 0xF20: + //printf("cache debug data tag %08X\n", val); + return; + + case 0xF30: + //printf("cache debug instruction cache %08X\n", val); + return; + + case 0xF40: + //printf("cache debug data cache %08X\n", val); + return; + } if ((id&0xF00)!=0x700) @@ -704,7 +738,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -726,16 +760,18 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { + DataRegion = addr; + if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -745,18 +781,20 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { + DataRegion = addr; + addr &= ~1; if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -766,18 +804,20 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { + DataRegion = addr; + addr &= ~3; if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -792,13 +832,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -808,16 +848,21 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { + DataRegion = addr; + if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -827,18 +872,23 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { + DataRegion = addr; + addr &= ~1; if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -848,18 +898,23 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { + DataRegion = addr; + addr &= ~3; if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -874,13 +929,16 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 5745f34..de1c70d 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,14 @@ char DSiBIOS7Path[1024]; char DSiFirmwarePath[1024]; char DSiNANDPath[1024]; +#ifdef JIT_ENABLED +int JIT_Enable = false; +int JIT_MaxBlockSize = 32; +int JIT_BranchOptimisations = 2; +int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; +#endif + ConfigEntry ConfigFile[] = { {"BIOS9Path", 1, BIOS9Path, 0, "", 1023}, @@ -48,6 +56,14 @@ ConfigEntry ConfigFile[] = {"DSiFirmwarePath", 1, DSiFirmwarePath, 0, "", 1023}, {"DSiNANDPath", 1, DSiNANDPath, 0, "", 1023}, +#ifdef JIT_ENABLED + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 2, NULL, 0}, + {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, +#endif + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 3947598..5916b4a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -51,6 +51,14 @@ extern char DSiBIOS7Path[1024]; extern char DSiFirmwarePath[1024]; extern char DSiNANDPath[1024]; +#ifdef JIT_ENABLED +extern int JIT_Enable; +extern int JIT_MaxBlockSize; +extern int JIT_BranchOptimisations; +extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; +#endif + } #endif // CONFIG_H diff --git a/src/DSi.cpp b/src/DSi.cpp index 216f724..97a63cd 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -26,6 +26,11 @@ #include "NDSCart.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi_NDMA.h" #include "DSi_I2C.h" #include "DSi_SD.h" @@ -34,15 +39,6 @@ #include "tiny-AES-c/aes.hpp" -namespace NDS -{ - -extern ARMv5* ARM9; -extern ARMv4* ARM7; - -} - - namespace DSi { @@ -59,9 +55,9 @@ u8 ARM7iBIOS[0x10000]; u32 MBK[2][9]; -u8 NWRAM_A[0x40000]; -u8 NWRAM_B[0x40000]; -u8 NWRAM_C[0x40000]; +u8* NWRAM_A; +u8* NWRAM_B; +u8* NWRAM_C; u8* NWRAMMap_A[2][4]; u8* NWRAMMap_B[3][8]; @@ -86,6 +82,12 @@ u8 ARM7Init[0x3C00]; bool Init() { +#ifndef JIT_ENABLED + NWRAM_A = new u8[NWRAMSize]; + NWRAM_B = new u8[NWRAMSize]; + NWRAM_C = new 
u8[NWRAMSize]; +#endif + if (!DSi_I2C::Init()) return false; if (!DSi_AES::Init()) return false; @@ -106,6 +108,12 @@ bool Init() void DeInit() { +#ifndef JIT_ENABLED + delete[] NWRAM_A; + delete[] NWRAM_B; + delete[] NWRAM_C; +#endif + DSi_I2C::DeInit(); DSi_AES::DeInit(); @@ -176,7 +184,12 @@ void SoftReset() NDS::ARM9->Reset(); NDS::ARM7->Reset(); + NDS::ARM9->CP15Reset(); + memcpy(NDS::ARM9->ITCM, ITCMInit, 0x8000); +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidateITCM(); +#endif DSi_AES::Reset(); @@ -274,9 +287,9 @@ bool LoadNAND() { printf("Loading DSi NAND\n"); - memset(NWRAM_A, 0, 0x40000); - memset(NWRAM_B, 0, 0x40000); - memset(NWRAM_C, 0, 0x40000); + memset(NWRAM_A, 0, NWRAMSize); + memset(NWRAM_B, 0, NWRAMSize); + memset(NWRAM_C, 0, NWRAMSize); memset(MBK, 0, sizeof(MBK)); memset(NWRAMMap_A, 0, sizeof(NWRAMMap_A)); @@ -527,6 +540,8 @@ void MapNWRAM_A(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(0); + int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -558,6 +573,8 @@ void MapNWRAM_B(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(1); + int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -593,6 +610,8 @@ void MapNWRAM_C(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(2); + int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -625,6 +644,8 @@ void MapNWRAMRange(u32 cpu, u32 num, u32 val) u32 oldval = MBK[cpu][5+num]; if (oldval == val) return; + ARMJIT_Memory::RemapNWRAM(num); + MBK[cpu][5+num] = val; // TODO: what happens when the ranges are 'out of range'???? @@ -826,19 +847,31 @@ void ARM9Write8(u32 addr, u8 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write8(addr, val); @@ -859,19 +892,31 @@ void ARM9Write16(u32 addr, u16 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write16(addr, val); @@ -892,19 +937,31 @@ void ARM9Write32(u32 addr, u32 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write32(addr, val); @@ -1085,19 +1142,37 @@ void ARM7Write8(u32 addr, u8 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); +#endif + } return; } return NDS::ARM7Write8(addr, val); @@ -1118,19 +1193,31 @@ void ARM7Write16(u32 addr, u16 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write16(addr, val); @@ -1151,19 +1238,31 @@ void ARM7Write32(u32 addr, u32 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, 
ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write32(addr, val); @@ -1521,7 +1620,7 @@ u8 ARM7IORead8(u32 addr) case 0x04004501: return DSi_I2C::Cnt; case 0x04004D00: if (SCFG_BIOS & (1<<10)) return 0; return ConsoleID & 0xFF; - case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; + case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; case 0x04004D02: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 16) & 0xFF; case 0x04004D03: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 24) & 0xFF; case 0x04004D04: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 32) & 0xFF; @@ -25,6 +25,8 @@ namespace DSi { +extern u16 SCFG_BIOS; + extern u8 ARM9iBIOS[0x10000]; extern u8 ARM7iBIOS[0x10000]; @@ -34,6 +36,19 @@ extern u64 ConsoleID; extern DSi_SDHost* SDMMC; extern DSi_SDHost* SDIO; +const u32 NWRAMSize = 0x40000; + +extern u8* NWRAM_A; +extern u8* NWRAM_B; +extern u8* NWRAM_C; + +extern u8* NWRAMMap_A[2][4]; +extern u8* NWRAMMap_B[3][8]; +extern u8* NWRAMMap_C[3][8]; + +extern u32 NWRAMStart[2][3]; +extern u32 NWRAMEnd[2][3]; +extern u32 NWRAMMask[2][3]; bool Init(); void DeInit(); diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp index 9984f5e..e22c708 100644 --- a/src/DSi_I2C.cpp +++ b/src/DSi_I2C.cpp @@ -21,6 +21,7 @@ #include "DSi.h" #include "DSi_I2C.h" #include "DSi_Camera.h" +#include "ARM.h" namespace DSi_BPTWL { @@ -108,7 +109,8 @@ void Write(u8 val, bool last) printf("BPTWL: soft-reset\n"); val = 0; // checkme // TODO: soft-reset might need to be scheduled later! 
- DSi::SoftReset(); + // TODO: this has been moved for the JIT to work, nothing is confirmed here + NDS::ARM7->Halt(4); CurPos = -1; return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 22368ae..6981a42 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,11 @@ #include "AREngine.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -93,17 +98,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -168,6 +173,14 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED + ARMJIT::Init(); +#else + MainRAM = new u8[0x1000000]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; +#endif + DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); DMAs[2] = new DMA(0, 2); @@ -200,6 +213,10 @@ void DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED + ARMJIT::DeInit(); +#endif + for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -249,11 +266,9 @@ void SetARM9RegionTimings(u32 addrstart, u32 addrend, int buswidth, int nonseq, ARM9MemTimings[i][3] = S32; } - addrstart <<= 14; - addrend <<= 14; - if (!addrend) addrend = 0xFFFFFFFF; - - ARM9->UpdateRegionTimings(addrstart, addrend); + ARM9->UpdateRegionTimings(addrstart<<14, addrend == 0x40000 + ? 0xFFFFFFFF + : (addrend<<14)); } void SetARM7RegionTimings(u32 addrstart, u32 addrend, int buswidth, int nonseq, int seq) @@ -478,6 +493,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -492,6 +511,10 @@ void Reset() ARM9ClockShift = 1; MainRAMMask = 0x3FFFFF; } + // has to be called before InitTimings + // otherwise some PU settings are completely + // unitialised on the first run + ARM9->CP15Reset(); ARM9Timestamp = 0; ARM9Target = 0; ARM7Timestamp = 0; ARM7Target = 0; @@ -499,7 +522,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -690,7 +713,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -787,6 +810,13 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED + if (!file->Saving) + { + ARMJIT::ResetBlockCache(); + } +#endif + return true; } @@ -877,6 +907,7 @@ void RunSystem(u64 timestamp) } } +template <bool EnableJIT> u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -910,7 +941,12 @@ u32 RunFrame() } else { - ARM9->Execute(); +#ifdef JIT_ENABLED + if (EnableJIT) + ARM9->ExecuteJIT(); + else +#endif + ARM9->Execute(); } RunTimers(0); @@ -933,7 +969,12 @@ u32 RunFrame() } else { - ARM7->Execute(); +#ifdef JIT_ENABLED + if (EnableJIT) + ARM7->ExecuteJIT(); + else +#endif + ARM7->Execute(); } RunTimers(1); @@ -963,6 +1004,16 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ +#ifdef JIT_ENABLED + if (Config::JIT_Enable) + return RunFrame<true>(); + else +#endif + return RunFrame<false>(); +} + void 
Reschedule(u64 target) { if (CurCPU == 0) @@ -1082,36 +1133,41 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch (WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } } @@ -1166,9 +1222,9 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); if ((ConsoleType == 1) && cpu) - arm->IRQ |= (IE2 & IF2); + arm->IRQ |= !!(IE2 & IF2); } else { @@ -1787,9 +1843,9 @@ u8 ARM9Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1852,9 +1908,9 @@ u16 ARM9Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1917,9 +1973,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1974,13 +2030,19 @@ void ARM9Write8(u32 addr, u8 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u8*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2024,13 +2086,19 @@ void ARM9Write16(u32 addr, u16 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u16*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2044,13 +2112,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: 
GPU::WriteVRAM_ABG<u16>(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG<u16>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u16>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u16>(addr, val); return; - default: GPU::WriteVRAM_LCDC<u16>(addr, val); return; + default: GPU::WriteVRAM_LCDC<u16>(addr, val); return; } case 0x07000000: @@ -2090,13 +2161,19 @@ void ARM9Write32(u32 addr, u32 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u32*)&MainRAM[addr & MainRAMMask] = val; return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2110,13 +2187,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG<u32>(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG<u32>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u32>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u32>(addr, val); return; - default: GPU::WriteVRAM_LCDC<u32>(addr, val); return; + default: GPU::WriteVRAM_LCDC<u32>(addr, val); return; } case 0x07000000: @@ -2149,7 +2229,7 @@ void ARM9Write32(u32 addr, u32 val) return; } - printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); + //printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); } bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) @@ -2162,10 +2242,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2204,17 +2284,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2264,17 +2344,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2331,17 +2411,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + 
return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2385,23 +2465,35 @@ void ARM7Write8(u32 addr, u8 val) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u8*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2410,6 +2502,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); +#endif GPU::WriteVRAM_ARM7<u8>(addr, val); return; @@ -2444,23 +2539,35 @@ void ARM7Write16(u32 addr, u16 val) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u16*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2477,6 +2584,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); +#endif GPU::WriteVRAM_ARM7<u16>(addr, val); return; @@ -2513,23 +2623,35 @@ void ARM7Write32(u32 addr, u32 val) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); +#endif *(u32*)&MainRAM[addr & MainRAMMask] = val; return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); +#endif + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); +#endif + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2547,6 +2669,9 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 
0x06800000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); +#endif GPU::WriteVRAM_ARM7<u32>(addr, val); return; @@ -2594,17 +2719,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } @@ -80,7 +80,7 @@ enum IRQ_IPCSendDone, IRQ_IPCRecv, IRQ_CartSendDone, // TODO: less misleading name - IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pokémon Typing Adventure, BT controller) + IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pok�mon Typing Adventure, BT controller) IRQ_GXFIFO, IRQ_LidOpen, IRQ_SPI, @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,11 +162,22 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; +const u32 MainRAMMaxSize = 0x1000000; + +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; + +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; + extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h new file mode 100644 index 0000000..40c4576 --- /dev/null +++ b/src/dolphin/Align.h @@ -0,0 +1,24 @@ +// This file is under the public domain. + +#pragma once + +#include <cstddef> +#include <type_traits> + +namespace Common +{ +template <typename T> +constexpr T AlignUp(T value, size_t size) +{ + static_assert(std::is_unsigned<T>(), "T must be an unsigned value."); + return static_cast<T>(value + (size - value % size) % size); +} + +template <typename T> +constexpr T AlignDown(T value, size_t size) +{ + static_assert(std::is_unsigned<T>(), "T must be an unsigned value."); + return static_cast<T>(value - value % size); +} + +} // namespace Common diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp new file mode 100644 index 0000000..dd2416b --- /dev/null +++ b/src/dolphin/Arm64Emitter.cpp @@ -0,0 +1,4466 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cinttypes> +#include <cstring> +#include <vector> + +#include "Compat.h" +#include "Align.h" +#include "Arm64Emitter.h" +#include "BitUtils.h" +#include "../types.h" +#include "MathUtil.h" + +namespace Arm64Gen +{ +namespace +{ +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +int CountLeadingZeros(uint64_t value, int width) +{ + // TODO(jbramley): Optimize this for ARM64 hosts. 
+ int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) + { + count++; + bit_test >>= 1; + } + return count; +} + +uint64_t LargestPowerOf2Divisor(uint64_t value) +{ + return value & -(int64_t)value; +} + +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32* val, bool* shift) +{ + if (input < 4096) + { + *val = input; + *shift = false; + return true; + } + else if ((input & 0xFFF000) == input) + { + *val = input >> 12; + *shift = true; + return true; + } + return false; +} + +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s, + unsigned int* imm_r) +{ + // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL)); + // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits)); + + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) + { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) + { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. 
Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) + { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } + else + { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) + { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } + else + { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!MathUtil::IsPow2<u64>(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array<uint64_t, 6> multipliers = {{ + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }}; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. 
+ DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast<size_t>(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) + { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } + else + { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms. + *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + +float FPImm8ToFloat(u8 bits) +{ + const u32 sign = bits >> 7; + const u32 bit6 = (bits >> 6) & 1; + const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + const u32 mantissa = (bits & 0xF) << 19; + const u32 f = (sign << 31) | (exp << 23) | mantissa; + + return Common::BitCast<float>(f); +} + +bool FPImm8FromFloat(float value, u8* imm_out) +{ + const u32 f = Common::BitCast<u32>(value); + const u32 mantissa4 = (f & 0x7FFFFF) >> 19; + const u32 exponent = (f >> 23) & 0xFF; + const u32 sign = f >> 31; + + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + + const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4; + const float new_float = FPImm8ToFloat(imm8); + if (new_float == value) + *imm_out = imm8; + else + return false; + + return true; +} +} // Anonymous namespace + +void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr) +{ + m_code = ptr; +} + +void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr) +{ + SetCodePtrUnsafe(ptr); + m_lastCacheFlushEnd = ptr; +} + +void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase) +{ + m_code = 0; + m_lastCacheFlushEnd = 0; + m_rwbase = rwbase; + m_rxbase = rxbase; +} + +ptrdiff_t ARM64XEmitter::GetCodeOffset() +{ + return m_code; +} + +const u8* ARM64XEmitter::GetRWPtr() +{ + return m_rwbase + m_code; +} + +u8* ARM64XEmitter::GetWriteableRWPtr() +{ + return m_rwbase + m_code; +} + +void* ARM64XEmitter::GetRXPtr() +{ + return m_rxbase + m_code; +} + +void ARM64XEmitter::ReserveCodeSpace(u32 bytes) +{ + for (u32 i = 0; i < bytes / 4; i++) + BRK(0); +} + +ptrdiff_t ARM64XEmitter::AlignCode16() +{ + int c = int((u64)m_code & 15); + if (c) + ReserveCodeSpace(16 - c); + return m_code; +} + 
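(Editor's illustration, not part of the patch.) The IsImmLogical helper above implements the ARMv8 rule that a logical immediate must be a single contiguous run of set bits, rotated and then repeated across the register at a power-of-two period; the N/imms/immr fields it returns are what the emitter later packs into the immediate forms of AND/ORR/EOR/ANDS, and constants that fail the check have to be materialised some other way (for example MOVZ/MOVK or a register operand). A minimal probe sketch follows, assuming the IsImmLogical signature shown in the hunk above; since the real definition sits in an anonymous namespace in Arm64Emitter.cpp, this would in practice be compiled into the same translation unit or use a copy of the function.

// Standalone sketch (assumption: IsImmLogical is visible to this code).
// Prints whether a 64-bit constant qualifies as an ARM64 logical immediate.
#include <cstdint>
#include <cstdio>

bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n,
                  unsigned int* imm_s, unsigned int* imm_r);  // as in the hunk above

static void ProbeLogicalImm(uint64_t value)
{
    unsigned int n, imm_s, imm_r;
    if (IsImmLogical(value, 64, &n, &imm_s, &imm_r))
        std::printf("%016llx encodable: N=%u imms=0x%02X immr=0x%02X\n",
                    (unsigned long long)value, n, imm_s, imm_r);
    else
        std::printf("%016llx not encodable (materialise it instead)\n",
                    (unsigned long long)value);
}

int main()
{
    ProbeLogicalImm(0x5555555555555555ULL); // alternating bits: a 1-bit run repeated every 2 bits
    ProbeLogicalImm(0x00FF00FF00FF00FFULL); // an 8-bit run repeated every 16 bits
    ProbeLogicalImm(0x1234567812345678ULL); // no single rotated run of ones, so rejected
    return 0;
}

The middle case illustrates why the function repeats a 32-bit input into a 64-bit word first: the same run-and-period analysis then covers both register widths with one code path.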
+ptrdiff_t ARM64XEmitter::AlignCodePage() +{ + int c = int((u64)m_code & 4095); + if (c) + ReserveCodeSpace(4096 - c); + return m_code; +} + +void ARM64XEmitter::Write32(u32 value) +{ + std::memcpy(m_rwbase + m_code, &value, sizeof(u32)); + m_code += sizeof(u32); +} + +void ARM64XEmitter::FlushIcache() +{ + FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code); + m_lastCacheFlushEnd = m_code; +} + +void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end) +{ + if (start == end) + return; + +#if defined(IOS) + // Header file says this is equivalent to: sys_icache_invalidate(start, end - start); + sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start); +#else + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + u64 addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + addr = (u64)start & ~(u64)(dsize - 1); + for (; addr < (u64)end; addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069. + __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = (u64)start & ~(u64)(isize - 1); + for (; addr < (u64)end; addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 
0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt); +} + +void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr) +{ + bool b64Bit = Is64Bit(Rt); + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + (((u32)distance << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) +{ + s64 distance = (s64)ptr - s64(m_rxbase + m_code); + + ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, + __func__, distance); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) +{ + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", + __func__, imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) +{ + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | 
(ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? (1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, 
ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) +{ + Rs = DecodeReg(Rs); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) | + (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | + (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) +{ + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) +{ + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, 
u32 immr, u32 imms, + int n) +{ + // Sometimes Rd is fixed to SP, but can still be 32bit or 64bit. + // Use Rn to determine bitness here. + bool b64Bit = Is64Bit(Rn); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm) +{ + bool b64Bit = Is64Bit(Rt); + u32 type_encode = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (b64Bit) + { + op |= 0b10; + imm >>= 3; + } + else + { + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} +void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) +{ + Rd = DecodeReg(Rd); + + Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +static constexpr bool IsInRangeImm19(s64 distance) +{ + return (distance >= -0x40000 && distance <= 0x3FFFF); +} + +static constexpr bool IsInRangeImm14(s64 distance) +{ + return (distance >= -0x2000 && distance <= 0x1FFF); +} + +static constexpr bool IsInRangeImm26(s64 distance) +{ + return (distance >= -0x2000000 && distance <= 0x1FFFFFF); +} + +static constexpr u32 MaskImm19(s64 distance) +{ + return distance & 0x7FFFF; +} + +static constexpr u32 MaskImm14(s64 distance) +{ + return distance & 0x3FFF; +} + +static constexpr u32 MaskImm26(s64 distance) +{ + return distance & 0x3FFFFFF; +} + +// FixupBranch branching +void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) +{ + bool Not = false; + u32 inst = 0; + s64 distance = (s64)(m_code - branch.ptr); + distance >>= 2; + + switch (branch.type) + { + case 1: // CBNZ + Not = true; + case 0: // CBZ + { + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + bool b64Bit = Is64Bit(branch.reg); + ARM64Reg reg = DecodeReg(branch.reg); + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg; + } + break; + case 2: // B (conditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond; + break; + case 4: // TBNZ + Not = true; + case 3: // TBZ + { + ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + ARM64Reg reg = DecodeReg(branch.reg); + inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | + (MaskImm14(distance) << 5) | reg; + } + break; + case 5: // B (uncoditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" 
PRIx64, + __func__, branch.type, distance); + inst = (0x5 << 26) | MaskImm26(distance); + break; + case 6: // BL (unconditional) + ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + + std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() +{ + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) +{ + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) +{ + s64 distance = (s64)ptr - (s64)(m_rxbase + m_code); + + distance >>= 2; + + ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr, + distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) +{ + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) +{ + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) +{ + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BLR(scratchreg); + } + else + { + BL(func); + } +} + +void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func) +{ + s64 distance = (s64)func - (s64)(m_rxbase + m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) + { + // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! 
Using scratch.", m_code, + // func); + MOVI2R(scratchreg, (uintptr_t)func); + BR(scratchreg); + } + else + { + B(func); + } +} + +// Unconditional Branch (register) +void ARM64XEmitter::BR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::BLR(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::RET(ARM64Reg Rn) +{ + EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::ERET() +{ + EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP); +} +void ARM64XEmitter::DRPS() +{ + EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP); +} + +// Exception generation +void ARM64XEmitter::SVC(u32 imm) +{ + EncodeExceptionInst(0, imm); +} + +void ARM64XEmitter::HVC(u32 imm) +{ + EncodeExceptionInst(1, imm); +} + +void ARM64XEmitter::SMC(u32 imm) +{ + EncodeExceptionInst(2, imm); +} + +void ARM64XEmitter::BRK(u32 imm) +{ + EncodeExceptionInst(3, imm); +} + +void ARM64XEmitter::HLT(u32 imm) +{ + EncodeExceptionInst(4, imm); +} + +void ARM64XEmitter::DCPS1(u32 imm) +{ + EncodeExceptionInst(5, imm); +} + +void ARM64XEmitter::DCPS2(u32 imm) +{ + EncodeExceptionInst(6, imm); +} + +void ARM64XEmitter::DCPS3(u32 imm) +{ + EncodeExceptionInst(7, imm); +} + +// System +void ARM64XEmitter::_MSR(PStateField field, u8 imm) +{ + u32 op1 = 0, op2 = 0; + switch (field) + { + case FIELD_SPSel: + op1 = 0; + op2 = 5; + break; + case FIELD_DAIFSet: + op1 = 3; + op2 = 6; + break; + case FIELD_DAIFClr: + op1 = 3; + op2 = 7; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2) +{ + switch (field) + { + case FIELD_NZCV: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 2; + op2 = 0; + break; + case FIELD_FPCR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 0; + break; + case FIELD_FPSR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 1; + break; + case FIELD_PMCR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 6; + op2 = 0; + break; + case FIELD_PMCCNTR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 7; + op2 = 0; + break; + default: + ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to"); + break; + } +} + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) +{ + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS <Xt>, CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + +void ARM64XEmitter::HINT(SystemHint op) +{ + EncodeSystemInst(0, 3, 2, 0, op, WSP); +} +void ARM64XEmitter::CLREX() +{ + EncodeSystemInst(0, 3, 3, 0, 2, WSP); +} +void ARM64XEmitter::DSB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 4, WSP); +} +void ARM64XEmitter::DMB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 5, WSP); +} +void ARM64XEmitter::ISB(BarrierType type) +{ + EncodeSystemInst(0, 3, 3, type, 6, WSP); 
+} + +// Add/Subtract (extended register) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm) +{ + CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm) +{ + CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) +{ + EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +// Add/Subtract (with carry) +void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm); +} +void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm); +} +void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm); +} +void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm); +} + +// Conditional Compare (immediate) +void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond); +} + +// Conditiona Compare (register) +void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) +{ + EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond); +} + +// Conditional Select +void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(0, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(1, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(2, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EncodeCondSelectInst(3, Rd, Rn, Rm, cond); +} + +// Data-Processing 1 source +void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(0, Rd, Rn); +} +void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(1, Rd, Rn); +} +void 
ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(2, Rd, Rn); +} +void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) +{ + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void 
ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) +{ + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) +{ + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) +{ + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) +{ + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift) +{ + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) +{ + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? 
SP : WSP); +} + +// Data Processing (Immediate) +void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(2, Rd, imm, pos); +} +void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(0, Rd, imm, pos); +} +void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos) +{ + EncodeMOVWideInst(3, Rd, imm, pos); +} + +// Bitfield move +void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms); +} +void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms); +} +void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) +{ + EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms); +} + +void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) +{ + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG(DYNA_REC, (lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) +{ + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG(DYNA_REC, (lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) +{ + bool sf = Is64Bit(Rd); + bool N = sf; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd); +} +void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + SBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + SBFM(Rd, Rn, 0, 15); +} +void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__); + SBFM(Rd, Rn, 0, 31); +} +void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn) +{ + UBFM(Rd, Rn, 0, 15); +} + +// Load Register (Literal) +void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(0, Rt, imm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(2, Rt, imm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm) +{ + EncodeLoadRegisterInst(3, Rt, imm); +} + +// Load/Store pair +void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm); +} + +// Load/Store Exclusive +void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(4, SP, SP, Rt, 
Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) +{ + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) +{ + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 
1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 
2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must have a 64bit destination register!", __func__); + EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm); +} + +void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + switch (size | signExtend) + { + case 32: LDR (Rt, Rn, Rm); break; + case 33: LDRSW(Rt, Rn, Rm); break; + case 16: LDRH (Rt, Rn, Rm); break; + case 17: LDRSH(Rt, Rn, Rm); break; + case 8: LDRB (Rt, Rn, Rm); break; + case 9: LDRSB(Rt, Rn, Rm); break; + default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break; + } +} +void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + switch (size) + { + case 32: STR (Rt, Rn, Rm); break; + case 16: STRH (Rt, Rn, Rm); break; + case 8: STRB (Rt, Rn, Rm); break; + default: PanicAlert("STRGeneric(reg): invalid size %d", size); break; + } +} + +void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + switch (size | signExtend) + { + case 32: LDR (type, Rt, Rn, imm); break; + case 33: LDRSW(type, Rt, Rn, imm); break; + case 16: LDRH (type, Rt, Rn, imm); break; + case 17: LDRSH(type, Rt, Rn, imm); break; + case 8: LDRB (type, Rt, Rn, imm); break; + case 9: LDRSB(type, Rt, Rn, imm); break; + default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break; + } +} +void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + switch (size) + { + case 32: STR (type, Rt, Rn, imm); break; + case 16: STRH (type, Rt, Rn, imm); break; + case 8: STRB (type, Rt, Rn, imm); break; + default: PanicAlert("STRGeneric(imm): invalid size %d", size); break; + } +} + +// Address of label/page PC-relative +void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm) +{ + EncodeAddressInst(0, Rd, imm); +} +void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm) +{ + EncodeAddressInst(1, Rd, imm >> 12); +} + +// Wrapper around MOVZ+MOVK (and later MOVN) +void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) +{ + unsigned int parts = Is64Bit(Rd) ? 4 : 2; + BitSet32 upload_part(0); + + // Always start with a movz! Kills the dependency on the register. + bool use_movz = true; + + if (!imm) + { + // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks + // clearer in disasm too. + MOVZ(Rd, 0, SHIFT_0); + return; + } + + if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) || + (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max())) + { + // Max unsigned value (or if signed, -1) + // Set to ~ZR + ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP; + ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0)); + return; + } + + // TODO: Make some more systemic use of MOVN, but this will take care of most cases. + // Small negative integer. 
Use MOVN + if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm) + { + MOVN(Rd, ~imm, SHIFT_0); + return; + } + + // XXX: Use MOVN when possible. + // XXX: Optimize more + // XXX: Support rotating immediates to save instructions + if (optimize) + { + for (unsigned int i = 0; i < parts; ++i) + { + if ((imm >> (i * 16)) & 0xFFFF) + upload_part[i] = 1; + } + } + + u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF; +s64 aligned_offset = (s64)imm - (s64)aligned_pc; + // The offset for ADR/ADRP is an s32, so make sure it can be represented in that + if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL) + { + // Immediate we are loading is within 4GB of our aligned range + // Most likely a address that we can load in one or two instructions + if (!(std::abs(aligned_offset) & 0xFFF)) + { + // Aligned ADR + ADRP(Rd, (s32)aligned_offset); + return; + } + else + { + // If the address is within 1MB of PC we can load it in a single instruction still + s64 offset = (s64)imm - (s64)(m_rxbase + m_code); + if (offset >= -0xFFFFF && offset <= 0xFFFFF) + { + ADR(Rd, (s32)offset); + return; + } + else + { + ADRP(Rd, (s32)(aligned_offset & ~0xFFF)); + ADD(Rd, Rd, imm & 0xFFF); + return; + } + } + } + + for (unsigned i = 0; i < parts; ++i) + { + if (use_movz && upload_part[i]) + { + MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); + use_movz = false; + } + else + { + if (upload_part[i] || !optimize) + MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i); + } + } +} + +bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2) +{ + // TODO: Also optimize for performance, not just for code size. + ptrdiff_t start_offset = GetCodeOffset(); + + MOVI2R(Rd, imm1); + int size1 = GetCodeOffset() - start_offset; + + SetCodePtrUnsafe(start_offset); + + MOVI2R(Rd, imm2); + int size2 = GetCodeOffset() - start_offset; + + SetCodePtrUnsafe(start_offset); + + bool element = size1 > size2; + + MOVI2R(Rd, element ? imm2 : imm1); + + return element; +} + +void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers) +{ + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); + + if (!num_regs) + return; + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last write to avoid the dependency between those stores. + + // The first push must adjust the SP, else a context switch may invalidate everything below SP. + if (num_regs & 1) + { + STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size); + } + else + { + ARM64Reg first_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg second_reg = (ARM64Reg)(X0 + *it++); + STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size); + } + + // Fast store for all other registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + { + ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg even_reg = (ARM64Reg)(X0 + *it++); + STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1)); + } + + ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__); +} + +void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask) +{ + int num_regs = registers.Count(); + int stack_size = (num_regs + (num_regs & 1)) * 8; + auto it = registers.begin(); + + if (!num_regs) + return; + + // We must adjust the SP in the end, so load the first (two) registers at least. 
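As a rough sketch of what the push/pop helpers above work out to for a hypothetical register set {X0, X1, X2} (so num_regs = 3 and stack_size = (3 + 1) * 8 = 32; these are illustrative emitter calls, assuming the X0–X2 enum values, not lines from the patch):

  STR(INDEX_PRE, X0, SP, -32);        // odd count: the first store pre-decrements SP by the whole frame
  STP(INDEX_SIGNED, X1, X2, SP, 16);  // the remaining even pair is stored at a fixed offset
  // ABI_PopRegisters mirrors this: LDP(INDEX_SIGNED, X1, X2, SP, 16), then LDR(INDEX_POST, X0, SP, 32),
  // so SP is written exactly once on the way in and once on the way out.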
+ ARM64Reg first = (ARM64Reg)(X0 + *it++); + ARM64Reg second; + if (!(num_regs & 1)) + second = (ARM64Reg)(X0 + *it++); + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last load to avoid the dependency between those loads. + + // Fast load for all but the first (two) registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) + { + ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++); + ARM64Reg even_reg = (ARM64Reg)(X0 + *it++); + LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1)); + } + + // Post loading the first (two) registers. + if (num_regs & 1) + LDR(INDEX_POST, first, SP, stack_size); + else + LDP(INDEX_POST, first, second, SP, stack_size); + + ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__); +} + +// Float Emitter +void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, + ARM64Reg Rn, s32 imm) +{ + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + u32 encoded_size = 0; + u32 encoded_imm = 0; + + if (size == 8) + encoded_size = 0; + else if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + else if (size == 128) + encoded_size = 0; + + if (type == INDEX_UNSIGNED) + { + ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)), + "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__, + imm, m_emit->GetCodePtr()); + ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", + __func__); + if (size == 16) + imm >>= 1; + else if (size == 32) + imm >>= 2; + else if (size == 64) + imm >>= 3; + else if (size == 128) + imm >>= 4; + encoded_imm = (imm & 0xFFF); + } + else + { + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), + "%s immediate offset must be within range of -256 to 256!", __func__); + encoded_imm = (imm & 0x1FF) << 2; + if (type == INDEX_POST) + encoded_imm |= 1; + else + encoded_imm |= 3; + } + + Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | + (size == 128 ? 
(1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) | + (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) +{ + DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) + { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 
1 : 0); + int rmode = 0; + switch (round) + { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } + else + { + // Use the encoding (vector, single) that keeps the result in the fp register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) + { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) +{ + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) | + (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = !IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! 
Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) | + Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, + imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) +{ + u32 type_encode = 0; + u32 opc = 0; + + switch (type) + { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) + { + ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } + else if (size == 64) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + imm >>= 3; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 
26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) +{ + ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) +{ + union + { + u8 hex; + struct + { + unsigned defgh : 5; + unsigned abc : 3; + }; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 1; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 1; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 1; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 1; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = 
EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) +{ + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) + { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 16) + { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 32) + { + S = (index & 1) != 0; + 
opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + else if (size == 64) + { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) +{ + ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", + __func__); + ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) +{ + if (IsScalar(Rd) && IsScalar(Rn)) + { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } + else + { + ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) + { + // GPR to scalar single + opcode |= 1; + } + else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) + { + // Scalar single to GPR - defaults are correct + } + else + { + // TODO + ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) +{ + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset 
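A minimal usage sketch for the paired SIMD/FP load/stores defined just above (assuming an ARM64FloatEmitter instance `fp`, the D0–D3 and X0 register enum values, and a valid base pointer in X0; hypothetical, for illustration only):

  fp.STP(64, INDEX_SIGNED, D0, D1, X0, 0);   // store two doubles at [x0] and [x0+8]
  fp.LDP(64, INDEX_SIGNED, D2, D3, X0, 0);   // load them back into d2/d3

The size argument (32/64/128) selects the opc field in EncodeLoadStorePair, which also checks that the immediate offset is aligned to the element size.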
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) +{ + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) +{ + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) +{ + int type = isDouble ? 
1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) | + (Rn << 5) | Rd); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) +{ + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void 
ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) +{ + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) +{ + u32 imm5 = 0, imm4 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64), + "%s must have a size of 64 when destination is 64bit!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + else if (size == 64) + { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 
0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) +{ + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) + { + imm5 = 1; + imm5 |= index << 1; + } + else if (size == 16) + { + imm5 = 2; + imm5 |= index << 2; + } + else if (size == 32) + { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) +{ + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) +{ + if (IsScalar(Rn)) + { + // Source is in FP register (like destination!). We must use a vector encoding. + bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } + else + { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) +{ + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) +{ + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) +{ + EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, 
ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) +{ + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) +{ + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) + { + immh = 1; + } + else if (src_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (src_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) +{ + 
ASSERT_MSG(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", + __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) + { + immh = 1; + } + else if (dest_size == 16) + { + immh = 2 | ((shift >> 3) & 1); + } + else if (dest_size == 32) + { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) +{ + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__); + } + else if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else // 64 + { + ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) + { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) +{ + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", + __func__); + + if (shift == 8) + cmode |= 2; + } + else if (size == 32) + { + ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones 
variant + switch (shift) + { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } + else + { + ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__); + } + EncodeModImm(Q, op, cmode, 0, Rd, imm); +} + +void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + int num_regs = registers.Count(); + m_emit->SUB(SP, SP, num_regs * 16); + m_emit->ADD(tmp, SP, 0); + std::vector<ARM64Reg> island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + + // 0 = true + // 1 < 4 && registers[i + 1] true! + // 2 < 4 && registers[i + 2] true! + // 3 < 4 && registers[i + 3] true! + // 4 < 4 && registers[i + 4] false! + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp); + + i += count - 1; + } + + // Handle island registers + std::vector<ARM64Reg> pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_POST, pair_regs[0], tmp, 16); + } + else + { + std::vector<ARM64Reg> pair_regs; + for (auto it : registers) + { + pair_regs.push_back((ARM64Reg)(Q0 + it)); + if (pair_regs.size() == 2) + { + STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_PRE, pair_regs[0], SP, -16); + } +} +void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp) +{ + bool bundled_loadstore = false; + int num_regs = registers.Count(); + + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + if (count > 1) + { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) + { + // The temporary register is only used to indicate that we can use this code path + std::vector<ARM64Reg> island_regs; + for (int i = 0; i < 32; ++i) + { + if (!registers[i]) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && registers[i + count]) + { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP); + + i += count - 1; + } + + // Handle island registers + std::vector<ARM64Reg> pair_regs; + for (auto& it : island_regs) + { + pair_regs.push_back(it); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + LDR(128, INDEX_POST, pair_regs[0], SP, 16); + } + else + { + bool odd = num_regs % 2; + std::vector<ARM64Reg> pair_regs; + for (int i = 31; i >= 0; --i) + { + if (!registers[i]) + continue; + + if (odd) + { + // First load must be a regular LDR if odd + odd = false; + LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16); + } + else + { + pair_regs.push_back((ARM64Reg)(Q0 + i)); + if (pair_regs.size() == 2) + { + LDP(128, INDEX_POST, pair_regs[1], 
pair_regs[0], SP, 32); + pair_regs.clear(); + } + } + } + } +} + +void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) + { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from %08x, need scratch", + (u32)imm); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) +{ + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, imm, shift); + break; + case 1: + ADDS(Rd, Rn, imm, shift); + break; + case 2: + SUB(Rd, Rn, imm, shift); + break; + case 3: + SUBS(Rd, Rn, imm, shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) +{ + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) + { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) + { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) + { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but inplace. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done on setting flags. 
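  // Editor's note (illustrative, not part of this commit): with imm = 0x123456 and
  // flags not requested, the branch below emits
  //   ADD Rd, Rn, #0x456
  //   ADD Rd, Rd, #0x123, LSL #12
  // rather than materialising the constant into a scratch register with MOVZ/MOVK.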
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) + { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) + { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(DYNA_REC, has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch", + (u32)imm); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | flags) + { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) +{ + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) +{ + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) +{ + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) +{ + ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) + { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm instruction, like + // 1.0... 
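    // Editor's note (not part of this commit): the FPImm8FromFloat branch below
    // covers the AArch64 FMOV (immediate) range, roughly +/-(1 + m/16) * 2^e with
    // m in 0..15 and e in -3..4, so constants such as 0.5, 1.0 or 2.0 need no scratch.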
+ } + else if (FPImm8FromFloat(value, &imm8)) + { + FMOV(Rd, imm8); + } + else + { + ASSERT_MSG(DYNA_REC, scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Common::BitCast<u32>(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) +{ + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Arm64Gen diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h new file mode 100644 index 0000000..3d9d4ba --- /dev/null +++ b/src/dolphin/Arm64Emitter.h @@ -0,0 +1,1151 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <cstring> +#include <functional> + +#include "ArmCommon.h" +#include "BitSet.h" +#include "Compat.h" + +namespace Arm64Gen +{ +// X30 serves a dual purpose as a link register +// Encoded as <u3:type><u5:reg> +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg +{ + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) +{ + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) +{ + return (reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) +{ + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool 
IsQuad(ARM64Reg reg) +{ + return (reg & 0xC0) == 0xC0; +} +constexpr bool IsVector(ARM64Reg reg) +{ + return (reg & 0xC0) != 0; +} +constexpr bool IsGPR(ARM64Reg reg) +{ + return static_cast<int>(reg) < 0x40; +} + +constexpr ARM64Reg DecodeReg(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg & 0x1F); +} +constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg | 0x20); +} +constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(DecodeReg(reg) + S0); +} +constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg) +{ + return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80); +} +constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg) +{ + return static_cast<ARM64Reg>(reg | 0xC0); +} + +enum OpType +{ + TYPE_IMM = 0, + TYPE_REG, + TYPE_IMMSREG, + TYPE_RSR, + TYPE_MEM +}; + +enum ShiftType +{ + ST_LSL = 0, + ST_LSR = 1, + ST_ASR = 2, + ST_ROR = 3, +}; + +enum IndexType +{ + INDEX_UNSIGNED, + INDEX_POST, + INDEX_PRE, + INDEX_SIGNED, // used in LDP/STP +}; + +enum ShiftAmount +{ + SHIFT_0 = 0, + SHIFT_16 = 1, + SHIFT_32 = 2, + SHIFT_48 = 3, +}; + +enum RoundingMode +{ + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero +}; + +struct FixupBranch +{ + ptrdiff_t ptr; + // Type defines + // 0 = CBZ (32bit) + // 1 = CBNZ (32bit) + // 2 = B (conditional) + // 3 = TBZ + // 4 = TBNZ + // 5 = B (unconditional) + // 6 = BL (unconditional) + u32 type; + + // Used with B.cond + CCFlags cond; + + // Used with TBZ/TBNZ + u8 bit; + + // Used with Test/Compare and Branch + ARM64Reg reg; +}; + +enum PStateField +{ + FIELD_SPSel = 0, + FIELD_DAIFSet, + FIELD_DAIFClr, + FIELD_NZCV, // The only system registers accessible from EL0 (user space) + FIELD_PMCR_EL0, + FIELD_PMCCNTR_EL0, + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, +}; + +enum SystemHint +{ + HINT_NOP = 0, + HINT_YIELD, + HINT_WFE, + HINT_WFI, + HINT_SEV, + HINT_SEVL, +}; + +enum BarrierType +{ + OSHLD = 1, + OSHST = 2, + OSH = 3, + NSHLD = 5, + NSHST = 6, + NSH = 7, + ISHLD = 9, + ISHST = 10, + ISH = 11, + LD = 13, + ST = 14, + SY = 15, +}; + +class ArithOption +{ +public: + enum WidthSpecifier + { + WIDTH_DEFAULT, + WIDTH_32BIT, + WIDTH_64BIT, + }; + + enum ExtendSpecifier + { + EXTEND_UXTB = 0x0, + EXTEND_UXTH = 0x1, + EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */ + EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */ + EXTEND_SXTB = 0x4, + EXTEND_SXTH = 0x5, + EXTEND_SXTW = 0x6, + EXTEND_SXTX = 0x7, + }; + + enum TypeSpecifier + { + TYPE_EXTENDEDREG, + TYPE_IMM, + TYPE_SHIFTEDREG, + }; + +private: + ARM64Reg m_destReg; + WidthSpecifier m_width; + ExtendSpecifier m_extend; + TypeSpecifier m_type; + ShiftType m_shifttype; + u32 m_shift; + +public: + ArithOption(ARM64Reg Rd, bool index = false) + { + // Indexed registers are a certain feature of AARch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + + m_destReg = Rd; + m_type = TYPE_EXTENDEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + m_extend = EXTEND_UXTX; + } + else + { + m_width = WIDTH_32BIT; + m_extend = EXTEND_UXTW; + } + m_shifttype = ST_LSL; + } + 
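  // Editor's note (illustrative, not part of this commit): with the register-offset
  // load/store overloads declared further below, this constructor is used roughly as
  //   LDR(W0, X1, ArithOption(X2, true));  // address = X1 + (X2 << 2) for a 32-bit LDR
  //   LDR(W0, X1, ArithOption(X2));        // address = X1 + X2, no scaling
  // where the scaled shift follows the access size noted above (LSL #1/#2/#3).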
ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) + { + m_destReg = Rd; + m_shift = shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) + { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } + else + { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { return m_type; } + ARM64Reg GetReg() const { return m_destReg; } + u32 GetData() const + { + switch (m_type) + { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter +{ + friend class ARM64FloatEmitter; + +private: + ptrdiff_t m_code; + ptrdiff_t m_lastCacheFlushEnd; + u8* m_rwbase; + u8* m_rxbase; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + // TODO: make this less ugly + // used for Switch where memory is executable and writeable and different addresses + // we need to take this for relative addressing in account + + void 
Write32(u32 value); + +public: + ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), m_rwbase(nullptr), m_rxbase(nullptr) {} + ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset) + { + m_rwbase = rwbase; + m_rxbase = rxbase; + m_code = offset; + m_lastCacheFlushEnd = offset; + } + + virtual ~ARM64XEmitter() {} + void SetCodePtr(ptrdiff_t ptr); + void SetCodePtrUnsafe(ptrdiff_t ptr); + void SetCodeBase(u8* rwbase, u8* rxbase); + void ReserveCodeSpace(u32 bytes); + ptrdiff_t AlignCode16(); + ptrdiff_t AlignCodePage(); + ptrdiff_t GetCodeOffset(); + const u8* GetRWPtr(); + u8* GetWriteableRWPtr(); + void* GetRXPtr(); + void FlushIcache(); + void FlushIcacheSection(u8* start, u8* end); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + 
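  // Editor's note (illustrative, not part of this commit): combined with the zero
  // register, CSINC/CSINV map onto the CSET/CSETM aliases defined just below, e.g.
  //   CSET(W0, CC_EQ);   // W0 = 1 if the flags indicate equality, else 0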
void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + void CSET(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) + { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + void 
BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); } + // Convenience wrappers around ORR. These match the official convenience syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); } + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); } + // Load Register (Literal) + void LDR(ARM64Reg Rt, u32 imm); + void LDRSW(ARM64Reg Rt, u32 imm); + void PRFM(ARM64Reg Rt, u32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void 
LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + + // Load/Store register (immediate indexed) + void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store register (register offset) + void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Load/Store register (unscaled offset) + void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store pair + void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Address of label/page PC-relative + void ADR(ARM64Reg Rd, s32 imm); + void ADRP(ARM64Reg Rd, s32 imm); + + // Wrapper around MOVZ+MOVK + void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true); + bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2); + template <class P> + void MOVP2R(ARM64Reg Rd, P* ptr) + { + ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers"); + MOVI2R(Rd, (uintptr_t)ptr); + } + + // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch + // register. + void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) + { + ANDSI2R(Is64Bit(Rn) ? 
ZR : WZR, Rn, imm, scratch); + } + void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch); + void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryCMPI2R(ARM64Reg Rn, u32 imm); + + bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + + // ABI related + void ABI_PushRegisters(BitSet32 registers); + void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0)); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template <typename T, typename... Args> + static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args) + { + return (*f)(args...); + } + + // This function expects you to have set up the state. + // Overwrites X0 and X30 + template <typename T, typename... Args> + ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f) + { + auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>; + MOVI2R(X30, (uintptr_t)trampoline); + MOVI2R(X0, (uintptr_t) const_cast<void*>((const void*)f)); + return X30; + } + + void QuickTailCall(ARM64Reg scratchreg, const void* func); + template <typename T> + void QuickTailCall(ARM64Reg scratchreg, T func) + { + QuickTailCall(scratchreg, (const void*)func); + } + + // Plain function call + void QuickCallFunction(ARM64Reg scratchreg, const void* func); + template <typename T> + void QuickCallFunction(ARM64Reg scratchreg, T func) + { + QuickCallFunction(scratchreg, (const void*)func); + } +}; + +class ARM64FloatEmitter +{ +public: + ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {} + void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore unscaled + void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore single structure + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + + // Loadstore multiple structure + void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, u8 count, 
IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + + // Loadstore paired + void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, 
ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that goes to an integer + // register + // and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + inline void Write32(u32 value) { m_emit->Write32(value); } + // Emitting functions + void 
EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, + ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +}
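Editor's note: the header above declares the two emitter classes the JIT drives, ARM64XEmitter for integer and branch instructions and ARM64FloatEmitter for NEON/FP. A minimal, illustrative sketch of how a caller might use the immediate helpers (not code from this commit; code-buffer setup via SetCodeBase and cache flushing are elided):

    using namespace Arm64Gen;

    void EmitAddConstant(ARM64XEmitter& emit, u64 value)
    {
        // x0 = x1 + value; ADDI2R falls back to MOVI2R into the scratch
        // register (x8 here) when the immediate cannot be encoded directly.
        emit.ADDI2R(X0, X1, value, X8);
        emit.RET();   // ret x30
    }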
\ No newline at end of file diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h new file mode 100644 index 0000000..6d82e9d --- /dev/null +++ b/src/dolphin/ArmCommon.h @@ -0,0 +1,27 @@ +// Copyright 2014 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "../types.h" + +enum CCFlags +{ + CC_EQ = 0, // Equal + CC_NEQ, // Not equal + CC_CS, // Carry Set + CC_CC, // Carry Clear + CC_MI, // Minus (Negative) + CC_PL, // Plus + CC_VS, // Overflow + CC_VC, // No Overflow + CC_HI, // Unsigned higher + CC_LS, // Unsigned lower or same + CC_GE, // Signed greater than or equal + CC_LT, // Signed less than + CC_GT, // Signed greater than + CC_LE, // Signed less than or equal + CC_AL, // Always (unconditional) 14 + CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same + CC_LO = CC_CC, // Alias of CC_CC Unsigned lower +}; +const u32 NO_COND = 0xE0000000; diff --git a/src/dolphin/BitSet.h b/src/dolphin/BitSet.h new file mode 100644 index 0000000..d32b020 --- /dev/null +++ b/src/dolphin/BitSet.h @@ -0,0 +1,218 @@ +// This file is under the public domain. + +#pragma once + +#include <cstddef> +#include <initializer_list> +#include <type_traits> +#include "../types.h" + +#ifdef _WIN32 + +#include <intrin.h> + +namespace Common +{ +template <typename T> +constexpr int CountSetBits(T v) +{ + // from https://graphics.stanford.edu/~seander/bithacks.html + // GCC has this built in, but MSVC's intrinsic will only emit the actual + // POPCNT instruction, which we're not depending on + v = v - ((v >> 1) & (T) ~(T)0 / 3); + v = (v & (T) ~(T)0 / 15 * 3) + ((v >> 2) & (T) ~(T)0 / 15 * 3); + v = (v + (v >> 4)) & (T) ~(T)0 / 255 * 15; + return (T)(v * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * 8; +} +inline int LeastSignificantSetBit(u8 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u16 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u32 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +inline int LeastSignificantSetBit(u64 val) +{ + unsigned long index; + _BitScanForward64(&index, val); + return (int)index; +} +#else +namespace Common +{ +constexpr int CountSetBits(u8 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u16 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u32 val) +{ + return __builtin_popcount(val); +} +constexpr int CountSetBits(u64 val) +{ + return __builtin_popcountll(val); +} +inline int LeastSignificantSetBit(u8 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u16 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u32 val) +{ + return __builtin_ctz(val); +} +inline int LeastSignificantSetBit(u64 val) +{ + return __builtin_ctzll(val); +} +#endif + +// Similar to std::bitset, this is a class which encapsulates a bitset, i.e. +// using the set bits of an integer to represent a set of integers. Like that +// class, it acts like an array of bools: +// BitSet32 bs; +// bs[1] = true; +// but also like the underlying integer ([0] = least significant bit): +// BitSet32 bs2 = ...; +// bs = (bs ^ bs2) & BitSet32(0xffff); +// The following additional functionality is provided: +// - Construction using an initializer list. 
+// BitSet bs { 1, 2, 4, 8 }; +// - Efficiently iterating through the set bits: +// for (int i : bs) +// [i is the *index* of a set bit] +// (This uses the appropriate CPU instruction to find the next set bit in one +// operation.) +// - Counting set bits using .Count() - see comment on that method. + +// TODO: use constexpr when MSVC gets out of the Dark Ages + +template <typename IntTy> +class BitSet +{ + static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types"); + +public: + // A reference to a particular bit, returned from operator[]. + class Ref + { + public: + constexpr Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {} + constexpr Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {} + constexpr operator bool() const { return (m_bs->m_val & m_mask) != 0; } + bool operator=(bool set) + { + m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0); + return set; + } + + private: + BitSet* m_bs; + IntTy m_mask; + }; + + // A STL-like iterator is required to be able to use range-based for loops. + class Iterator + { + public: + constexpr Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} + constexpr Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} + Iterator& operator=(Iterator other) + { + new (this) Iterator(other); + return *this; + } + Iterator& operator++() + { + if (m_val == 0) + { + m_bit = -1; + } + else + { + int bit = LeastSignificantSetBit(m_val); + m_val &= ~(1 << bit); + m_bit = bit; + } + return *this; + } + Iterator operator++(int) + { + Iterator other(*this); + ++*this; + return other; + } + constexpr int operator*() const { return m_bit; } + constexpr bool operator==(Iterator other) const { return m_bit == other.m_bit; } + constexpr bool operator!=(Iterator other) const { return m_bit != other.m_bit; } + + private: + IntTy m_val; + int m_bit; + }; + + constexpr BitSet() : m_val(0) {} + constexpr explicit BitSet(IntTy val) : m_val(val) {} + BitSet(std::initializer_list<int> init) + { + m_val = 0; + for (int bit : init) + m_val |= (IntTy)1 << bit; + } + + constexpr static BitSet AllTrue(size_t count) + { + return BitSet(count == sizeof(IntTy) * 8 ? 
~(IntTy)0 : (((IntTy)1 << count) - 1)); + } + + Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); } + constexpr const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; } + constexpr bool operator==(BitSet other) const { return m_val == other.m_val; } + constexpr bool operator!=(BitSet other) const { return m_val != other.m_val; } + constexpr bool operator<(BitSet other) const { return m_val < other.m_val; } + constexpr bool operator>(BitSet other) const { return m_val > other.m_val; } + constexpr BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); } + constexpr BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); } + constexpr BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); } + constexpr BitSet operator~() const { return BitSet(~m_val); } + constexpr BitSet operator<<(IntTy shift) const { return BitSet(m_val << shift); } + constexpr BitSet operator>>(IntTy shift) const { return BitSet(m_val >> shift); } + constexpr explicit operator bool() const { return m_val != 0; } + BitSet& operator|=(BitSet other) { return *this = *this | other; } + BitSet& operator&=(BitSet other) { return *this = *this & other; } + BitSet& operator^=(BitSet other) { return *this = *this ^ other; } + BitSet& operator<<=(IntTy shift) { return *this = *this << shift; } + BitSet& operator>>=(IntTy shift) { return *this = *this >> shift; } + // Warning: Even though on modern CPUs this is a single fast instruction, + // Dolphin's official builds do not currently assume POPCNT support on x86, + // so slower explicit bit twiddling is generated. Still should generally + // be faster than a loop. + constexpr unsigned int Count() const { return CountSetBits(m_val); } + constexpr Iterator begin() const { return ++Iterator(m_val, 0); } + constexpr Iterator end() const { return Iterator(m_val, -1); } + IntTy m_val; +}; +} // namespace Common + +using BitSet8 = Common::BitSet<u8>; +using BitSet16 = Common::BitSet<u16>; +using BitSet32 = Common::BitSet<u32>; +using BitSet64 = Common::BitSet<u64>; diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h new file mode 100644 index 0000000..8b64a92 --- /dev/null +++ b/src/dolphin/BitUtils.h @@ -0,0 +1,254 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <climits> +#include <cstddef> +#include <cstring> +#include <type_traits> + +namespace Common +{ +/// +/// Retrieves the size of a type in bits. +/// +/// @tparam T Type to get the size of. +/// +/// @return the size of the type in bits. +/// +template <typename T> +constexpr size_t BitSize() noexcept +{ + return sizeof(T) * CHAR_BIT; +} + +/// +/// Extracts a bit from a value. +/// +/// @param src The value to extract a bit from. +/// @param bit The bit to extract. +/// +/// @tparam T The type of the value. +/// +/// @return The extracted bit. +/// +template <typename T> +constexpr T ExtractBit(const T src, const size_t bit) noexcept +{ + return (src >> bit) & static_cast<T>(1); +} + +/// +/// Extracts a bit from a value. +/// +/// @param src The value to extract a bit from. +/// +/// @tparam bit The bit to extract. +/// @tparam T The type of the value. +/// +/// @return The extracted bit. 
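/// Editor's note (illustrative, not part of this commit):
/// ExtractBit<3>(u32{0b1000}) == 1 and ExtractBit<0>(u32{0b1000}) == 0.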
+/// +template <size_t bit, typename T> +constexpr T ExtractBit(const T src) noexcept +{ + static_assert(bit < BitSize<T>(), "Specified bit must be within T's bit width."); + + return ExtractBit(src, bit); +} + +/// +/// Extracts a range of bits from a value. +/// +/// @param src The value to extract the bits from. +/// @param begin The beginning of the bit range. This is inclusive. +/// @param end The ending of the bit range. This is inclusive. +/// +/// @tparam T The type of the value. +/// @tparam Result The returned result type. This is the unsigned analog +/// of a signed type if a signed type is passed as T. +/// +/// @return The extracted bits. +/// +template <typename T, typename Result = std::make_unsigned_t<T>> +constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept +{ + return static_cast<Result>(((static_cast<Result>(src) << ((BitSize<T>() - 1) - end)) >> + (BitSize<T>() - end + begin - 1))); +} + +/// +/// Extracts a range of bits from a value. +/// +/// @param src The value to extract the bits from. +/// +/// @tparam begin The beginning of the bit range. This is inclusive. +/// @tparam end The ending of the bit range. This is inclusive. +/// @tparam T The type of the value. +/// @tparam Result The returned result type. This is the unsigned analog +/// of a signed type if a signed type is passed as T. +/// +/// @return The extracted bits. +/// +template <size_t begin, size_t end, typename T, typename Result = std::make_unsigned_t<T>> +constexpr Result ExtractBits(const T src) noexcept +{ + static_assert(begin < end, "Beginning bit must be less than the ending bit."); + static_assert(begin < BitSize<T>(), "Beginning bit is larger than T's bit width."); + static_assert(end < BitSize<T>(), "Ending bit is larger than T's bit width."); + + return ExtractBits<T, Result>(src, begin, end); +} + +/// +/// Rotates a value left (ROL). +/// +/// @param value The value to rotate. +/// @param amount The number of bits to rotate the value. +/// @tparam T An unsigned type. +/// +/// @return The rotated value. +/// +template <typename T> +constexpr T RotateLeft(const T value, size_t amount) noexcept +{ + static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types left."); + + amount %= BitSize<T>(); + + if (amount == 0) + return value; + + return static_cast<T>((value << amount) | (value >> (BitSize<T>() - amount))); +} + +/// +/// Rotates a value right (ROR). +/// +/// @param value The value to rotate. +/// @param amount The number of bits to rotate the value. +/// @tparam T An unsigned type. +/// +/// @return The rotated value. +/// +template <typename T> +constexpr T RotateRight(const T value, size_t amount) noexcept +{ + static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types right."); + + amount %= BitSize<T>(); + + if (amount == 0) + return value; + + return static_cast<T>((value >> amount) | (value << (BitSize<T>() - amount))); +} + +/// +/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11. +/// Both edge cases of all zeros and all ones are considered valid masks, too. +/// +/// @param mask The mask value to test for validity. +/// +/// @tparam T The type of the value. +/// +/// @return A bool indicating whether the mask is valid. 
+/// +template <typename T> +constexpr bool IsValidLowMask(const T mask) noexcept +{ + static_assert(std::is_integral<T>::value, "Mask must be an integral type."); + static_assert(std::is_unsigned<T>::value, "Signed masks can introduce hard to find bugs."); + + // Can be efficiently determined without looping or bit counting. It's the counterpart + // to https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 + // and doesn't require special casing either edge case. + return (mask & (mask + 1)) == 0; +} + +/// +/// Reinterpret objects of one type as another by bit-casting between object representations. +/// +/// @remark This is the example implementation of std::bit_cast which is to be included +/// in C++2a. See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html +/// for more details. The only difference is this variant is not constexpr, +/// as the mechanism for bit_cast requires a compiler built-in to have that quality. +/// +/// @param source The source object to convert to another representation. +/// +/// @tparam To The type to reinterpret source as. +/// @tparam From The initial type representation of source. +/// +/// @return The representation of type From as type To. +/// +/// @pre Both To and From types must be the same size +/// @pre Both To and From types must satisfy the TriviallyCopyable concept. +/// +template <typename To, typename From> +inline To BitCast(const From& source) noexcept +{ + static_assert(sizeof(From) == sizeof(To), + "BitCast source and destination types must be equal in size."); + static_assert(std::is_trivially_copyable<From>(), + "BitCast source type must be trivially copyable."); + static_assert(std::is_trivially_copyable<To>(), + "BitCast destination type must be trivially copyable."); + + std::aligned_storage_t<sizeof(To), alignof(To)> storage; + std::memcpy(&storage, &source, sizeof(storage)); + return reinterpret_cast<To&>(storage); +} + +template <typename T, typename PtrType> +class BitCastPtrType +{ +public: + static_assert(std::is_trivially_copyable<PtrType>(), + "BitCastPtr source type must be trivially copyable."); + static_assert(std::is_trivially_copyable<T>(), + "BitCastPtr destination type must be trivially copyable."); + + explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {} + + // Enable operator= only for pointers to non-const data + template <typename S> + inline typename std::enable_if<std::is_same<S, T>() && !std::is_const<PtrType>()>::type + operator=(const S& source) + { + std::memcpy(m_ptr, &source, sizeof(source)); + } + + inline operator T() const + { + T result; + std::memcpy(&result, m_ptr, sizeof(result)); + return result; + } + +private: + PtrType* m_ptr; +}; + +// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs +// Conversion constructor and operator= provided for a convenient syntax. 
+// Usage: MyStruct s = BitCastPtr<MyStruct>(some_ptr); +// BitCastPtr<MyStruct>(some_ptr) = s; +template <typename T, typename PtrType> +inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType<T, PtrType> +{ + return BitCastPtrType<T, PtrType>{ptr}; +} + +template <typename T> +void SetBit(T& value, size_t bit_number, bool bit_value) +{ + static_assert(std::is_unsigned<T>(), "SetBit is only sane on unsigned types."); + + if (bit_value) + value |= (T{1} << bit_number); + else + value &= ~(T{1} << bit_number); +} + +} // namespace Common diff --git a/src/dolphin/CPUDetect.h b/src/dolphin/CPUDetect.h new file mode 100644 index 0000000..bd4fd8d --- /dev/null +++ b/src/dolphin/CPUDetect.h @@ -0,0 +1,76 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// Detect the CPU, so we'll know which optimizations to use +#pragma once + +#include <string> + +enum class CPUVendor +{ + Intel, + AMD, + ARM, + Other, +}; + +struct CPUInfo +{ + CPUVendor vendor = CPUVendor::Intel; + + char cpu_string[0x41] = {}; + char brand_string[0x21] = {}; + bool OS64bit = false; + bool CPU64bit = false; + bool Mode64bit = false; + + bool HTT = false; + int num_cores = 0; + int logical_cpu_count = 0; + + bool bSSE = false; + bool bSSE2 = false; + bool bSSE3 = false; + bool bSSSE3 = false; + bool bPOPCNT = false; + bool bSSE4_1 = false; + bool bSSE4_2 = false; + bool bLZCNT = false; + bool bSSE4A = false; + bool bAVX = false; + bool bAVX2 = false; + bool bBMI1 = false; + bool bBMI2 = false; + bool bFMA = false; + bool bFMA4 = false; + bool bAES = false; + // FXSAVE/FXRSTOR + bool bFXSR = false; + bool bMOVBE = false; + // This flag indicates that the hardware supports some mode + // in which denormal inputs _and_ outputs are automatically set to (signed) zero. + bool bFlushToZero = false; + bool bLAHFSAHF64 = false; + bool bLongMode = false; + bool bAtom = false; + + // ARMv8 specific + bool bFP = false; + bool bASIMD = false; + bool bCRC32 = false; + bool bSHA1 = false; + bool bSHA2 = false; + + // Call Detect() + explicit CPUInfo(); + + // Turn the CPU info into a string we can show + std::string Summarize(); + +private: + // Detects the various CPU features + void Detect(); +}; + +extern CPUInfo cpu_info; diff --git a/src/dolphin/CommonFuncs.cpp b/src/dolphin/CommonFuncs.cpp new file mode 100644 index 0000000..f85051d --- /dev/null +++ b/src/dolphin/CommonFuncs.cpp @@ -0,0 +1,52 @@ +// Copyright 2009 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include <cstddef> +#include <cstring> +#include <errno.h> +#include <type_traits> + +#include "CommonFuncs.h" + +#ifdef _WIN32 +#include <windows.h> +#define strerror_r(err, buf, len) strerror_s(buf, len, err) +#endif + +constexpr size_t BUFFER_SIZE = 256; + +// Wrapper function to get last strerror(errno) string. +// This function might change the error code. +std::string LastStrerrorString() +{ + char error_message[BUFFER_SIZE]; + + // There are two variants of strerror_r. The XSI version stores the message to the passed-in + // buffer and returns an int (0 on success). The GNU version returns a pointer to the message, + // which might have been stored in the passed-in buffer or might be a static string. + + // We check defines in order to figure out variant is in use, and we store the returned value + // to a variable so that we'll get a compile-time check that our assumption was correct. 
+ +#if defined(__GLIBC__) && (_GNU_SOURCE || (_POSIX_C_SOURCE < 200112L && _XOPEN_SOURCE < 600)) + const char* str = strerror_r(errno, error_message, BUFFER_SIZE); + return std::string(str); +#else + int error_code = strerror_r(errno, error_message, BUFFER_SIZE); + return error_code == 0 ? std::string(error_message) : ""; +#endif +} + +#ifdef _WIN32 +// Wrapper function to get GetLastError() string. +// This function might change the error code. +std::string GetLastErrorString() +{ + char error_message[BUFFER_SIZE]; + + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, GetLastError(), + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), error_message, BUFFER_SIZE, nullptr); + return std::string(error_message); +} +#endif diff --git a/src/dolphin/CommonFuncs.h b/src/dolphin/CommonFuncs.h new file mode 100644 index 0000000..708fbc3 --- /dev/null +++ b/src/dolphin/CommonFuncs.h @@ -0,0 +1,58 @@ +// Copyright 2009 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#pragma once + +#include <cstddef> +#include <string> +#include "../types.h" + +// Will fail to compile on a non-array: +template <typename T, size_t N> +constexpr size_t ArraySize(T (&arr)[N]) +{ + return N; +} + +#ifndef _WIN32 + +// go to debugger mode +#define Crash() \ + { \ + __builtin_trap(); \ + } + +#else // WIN32 +// Function Cross-Compatibility +#define strcasecmp _stricmp +#define strncasecmp _strnicmp +#define unlink _unlink +#define vscprintf _vscprintf + +// 64 bit offsets for Windows +#define fseeko _fseeki64 +#define ftello _ftelli64 +#define atoll _atoi64 +#define stat _stat64 +#define fstat _fstat64 +#define fileno _fileno + +extern "C" { +__declspec(dllimport) void __stdcall DebugBreak(void); +} +#define Crash() \ + { \ + DebugBreak(); \ + } +#endif // WIN32 ndef + +// Wrapper function to get last strerror(errno) string. +// This function might change the error code. +std::string LastStrerrorString(); + +#ifdef _WIN32 +// Wrapper function to get GetLastError() string. +// This function might change the error code. +std::string GetLastErrorString(); +#endif diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h new file mode 100644 index 0000000..787d505 --- /dev/null +++ b/src/dolphin/Compat.h @@ -0,0 +1,75 @@ +// Stubs for Assert.h and Log.h +#pragma once + +#include <assert.h> + +// Assert stub +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ + +// Log Stub +#include <cstdio> + +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define DYNA_REC 0 + +#define ERROR_LOG(which, fmt, ...) 
\ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + } while (false) + +#if __cplusplus < 201703L +// cheat +namespace std +{ +template <typename T> +T clamp(const T& v, const T& lo, const T& hi) +{ + return v < lo ? lo : (v > hi ? hi : v); +} +} +#endif
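The std::clamp backport just above exists because headers imported from Dolphin later in this commit (MathUtil.h's Rectangle::ClampLL/ClampUL, for example) call std::clamp, which only ships with C++17. A minimal caller-side sketch of what that enables; ClampVolume is an illustrative name, not something from this patch:

    #include <algorithm>   // real std::clamp when building as C++17 or newer
    #include "Compat.h"    // fallback std::clamp when __cplusplus < 201703L

    int ClampVolume(int v)
    {
        // Resolves to the Compat.h shim on older language standards and to
        // <algorithm>'s implementation otherwise; both return the same result.
        return std::clamp(v, 0, 127);
    }

Injecting names into namespace std is technically undefined behaviour, which is presumably why the shim is labelled "cheat"; it is only compiled when the real function is unavailable.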
\ No newline at end of file diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp new file mode 100644 index 0000000..70f2ede --- /dev/null +++ b/src/dolphin/MathUtil.cpp @@ -0,0 +1,13 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "MathUtil.h" + +#include <numeric> + +// Calculate sum of a float list +float MathFloatVectorSum(const std::vector<float>& Vec) +{ + return std::accumulate(Vec.begin(), Vec.end(), 0.0f); +} diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h new file mode 100644 index 0000000..b1dbbae --- /dev/null +++ b/src/dolphin/MathUtil.h @@ -0,0 +1,121 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <vector> + +#include "Compat.h" + +#include "../types.h" + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +namespace MathUtil +{ +constexpr double TAU = 6.2831853071795865; +constexpr double PI = TAU / 2; + +template <typename T> +constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{})) +{ + return (T{} < val) - (val < T{}); +} + +template <typename T, typename F> +constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a) +{ + return x + (y - x) * a; +} + +template <typename T> +constexpr bool IsPow2(T imm) +{ + return imm > 0 && (imm & (imm - 1)) == 0; +} + +constexpr u32 NextPowerOf2(u32 value) +{ + --value; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + ++value; + + return value; +} + +template <class T> +struct Rectangle +{ + T left{}; + T top{}; + T right{}; + T bottom{}; + + constexpr Rectangle() = default; + + constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom) + : left(theLeft), top(theTop), right(theRight), bottom(theBottom) + { + } + + constexpr bool operator==(const Rectangle& r) const + { + return left == r.left && top == r.top && right == r.right && bottom == r.bottom; + } + + T GetWidth() const { return abs(right - left); } + T GetHeight() const { return abs(bottom - top); } + // If the rectangle is in a coordinate system with a lower-left origin, use + // this Clamp. + void ClampLL(T x1, T y1, T x2, T y2) + { + left = std::clamp(left, x1, x2); + right = std::clamp(right, x1, x2); + top = std::clamp(top, y2, y1); + bottom = std::clamp(bottom, y2, y1); + } + + // If the rectangle is in a coordinate system with an upper-left origin, + // use this Clamp. + void ClampUL(T x1, T y1, T x2, T y2) + { + left = std::clamp(left, x1, x2); + right = std::clamp(right, x1, x2); + top = std::clamp(top, y1, y2); + bottom = std::clamp(bottom, y1, y2); + } +}; + +} // namespace MathUtil + +float MathFloatVectorSum(const std::vector<float>&); + +// Rounds down. 
0 -> undefined +inline int IntLog2(u64 val) +{ +#if defined(__GNUC__) + return 63 - __builtin_clzll(val); + +#elif defined(_MSC_VER) + unsigned long result = ULONG_MAX; + _BitScanReverse64(&result, val); + return result; + +#else + int result = -1; + while (val != 0) + { + val >>= 1; + ++result; + } + return result; +#endif +} diff --git a/src/dolphin/license_dolphin.txt b/src/dolphin/license_dolphin.txt new file mode 100644 index 0000000..d511905 --- /dev/null +++ b/src/dolphin/license_dolphin.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+ This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/src/dolphin/x64ABI.cpp b/src/dolphin/x64ABI.cpp new file mode 100644 index 0000000..d86a158 --- /dev/null +++ b/src/dolphin/x64ABI.cpp @@ -0,0 +1,119 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include "../types.h" +#include "x64ABI.h" +#include "x64Emitter.h" + +using namespace Gen; + +// Shared code between Win64 and Unix64 + +void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) +{ + size_t shadow = 0; +#if defined(_WIN32) + shadow = 0x20; +#endif + + int count = (mask & ABI_ALL_GPRS).Count(); + rsp_alignment -= count * 8; + size_t subtraction = 0; + int fpr_count = (mask & ABI_ALL_FPRS).Count(); + if (fpr_count) + { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xf; + } + subtraction += 16 * fpr_count; + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += shadow; + // Final alignment. + rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xf; + + *shadowp = shadow; + *subtractionp = subtraction; + *xmm_offsetp = subtraction - xmm_base_subtraction; +} + +size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int r : mask& ABI_ALL_GPRS) + PUSH((X64Reg)r); + + if (subtraction) + SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16)); + xmm_offset += 16; + } + + return shadow; +} + +void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, + &xmm_offset); + + for (int x : mask& ABI_ALL_FPRS) + { + MOVAPD((X64Reg)(x - 16), MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; + } + + if (subtraction) + ADD(64, R(RSP), subtraction >= 0x80 ? 
Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int r = 15; r >= 0; r--) + { + if (mask[r]) + POP((X64Reg)r); + } +} + +void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2, + Gen::X64Reg src2) +{ + if (dst1 == src2 && dst2 == src1) + { + XCHG(bits, R(src1), R(src2)); + if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + } + else if (src2 != dst1) + { + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) + MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + if (dst2 != src2) + MOV(bits, R(dst2), R(src2)); + } + else + { + if (dst2 != src2) + MOV(bits, R(dst2), R(src2)); + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) + MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); + } +} diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h new file mode 100644 index 0000000..94336d0 --- /dev/null +++ b/src/dolphin/x64ABI.h @@ -0,0 +1,58 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#pragma once + +#include "BitSet.h" +#include "x64Reg.h" + +// x64 ABI:s, and helpers to help follow them when JIT-ing code. +// All convensions return values in EAX (+ possibly EDX). + +// Windows 64-bit +// * 4-reg "fastcall" variant, very new-skool stack handling +// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself +// calls_ +// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space. +// Scratch: RAX RCX RDX R8 R9 R10 R11 +// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15 +// Parameters: RCX RDX R8 R9, further MOV-ed + +// Linux 64-bit +// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed) +// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11 +// Callee-save: RBX RBP R12 R13 R14 R15 +// Parameters: RDI RSI RDX RCX R8 R9 + +#define ABI_ALL_FPRS BitSet32(0xffff0000) +#define ABI_ALL_GPRS BitSet32(0x0000ffff) + +#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention + +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX +#define ABI_PARAM3 R8 +#define ABI_PARAM4 R9 + +// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. +#define ABI_ALL_CALLER_SAVED \ + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) +#else // 64-bit Unix / OS X + +#define ABI_PARAM1 RDI +#define ABI_PARAM2 RSI +#define ABI_PARAM3 RDX +#define ABI_PARAM4 RCX +#define ABI_PARAM5 R8 +#define ABI_PARAM6 R9 + +// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably +// don't actually clobber them. +#define ABI_ALL_CALLER_SAVED (BitSet32{RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11} | ABI_ALL_FPRS) +#endif // WIN32 + +#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED) + +#define ABI_RETURN RAX diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp new file mode 100644 index 0000000..49b51c9 --- /dev/null +++ b/src/dolphin/x64CPUDetect.cpp @@ -0,0 +1,273 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. 
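As a rough illustration of the calling conventions documented in x64ABI.h above, the sketch below brackets a call out of generated code with the push/pop helpers from x64ABI.cpp. EmitHelperCall, the alignment value of 8 and the zero frame size are assumptions made for the example; only the ABI_* names, BitSet32 and the emitter methods come from this patch:

    #include "x64ABI.h"
    #include "x64Emitter.h"

    // Emit a call to a C/C++ helper, preserving the caller-saved GPRs
    // (XMM registers would be added through the ABI_ALL_FPRS bits).
    void EmitHelperCall(Gen::XEmitter& emit, const void* helper, Gen::X64Reg arg_reg)
    {
        using namespace Gen;

        BitSet32 to_save = ABI_ALL_CALLER_SAVED & ABI_ALL_GPRS;

        // The 8 assumes RSP is 8 bytes off 16-byte alignment, as it typically is
        // right after the CALL that entered the generated code; no extra frame
        // space is requested, and Win64 shadow space is handled internally.
        emit.ABI_PushRegistersAndAdjustStack(to_save, 8, 0);

        if (arg_reg != ABI_PARAM1)
            emit.MOV(64, R(ABI_PARAM1), R(arg_reg)); // first integer argument register

        // CALL() asserts the target is within rel32 range of the code buffer.
        emit.CALL(helper);                           // result comes back in ABI_RETURN (RAX)

        emit.ABI_PopRegistersAndAdjustStack(to_save, 8, 0);
    }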
+ +#include <cstring> +#include <string> + +#include "CPUDetect.h" +#include "../types.h" + +#ifndef _MSVC_VER + +#ifdef __FreeBSD__ +#include <unistd.h> + +#include <machine/cpufunc.h> +#include <sys/types.h> +#endif + +static inline void __cpuidex(int info[4], int function_id, int subfunction_id) +{ +#ifdef __FreeBSD__ + // Despite the name, this is just do_cpuid() with ECX as second input. + cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info); +#else + info[0] = function_id; // eax + info[2] = subfunction_id; // ecx + __asm__("cpuid" + : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) + : "a"(function_id), "c"(subfunction_id)); +#endif +} + +static inline void __cpuid(int info[4], int function_id) +{ + return __cpuidex(info, function_id, 0); +} + +#endif // ifndef _WIN32 + +#ifdef _MSVC_VER + +static u64 xgetbv(u32 index) +{ + return _xgetbv(index); +} +constexpr u32 XCR_XFEATURE_ENABLED_MASK = _XCR_XFEATURE_ENABLED_MASK; + +#else + +static u64 xgetbv(u32 index) +{ + u32 eax, edx; + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((u64)edx << 32) | eax; +} +constexpr u32 XCR_XFEATURE_ENABLED_MASK = 0; +#endif // ifdef _WIN32 + +CPUInfo cpu_info; + +CPUInfo::CPUInfo() +{ + Detect(); +} + +// Detects the various CPU features +void CPUInfo::Detect() +{ +#ifdef _M_X86_64 + Mode64bit = true; + OS64bit = true; +#endif + num_cores = 1; + + // Set obvious defaults, for extra safety + if (Mode64bit) + { + bSSE = true; + bSSE2 = true; + bLongMode = true; + } + + // Assume CPU supports the CPUID instruction. Those that don't can barely + // boot modern OS:es anyway. + int cpu_id[4]; + + // Detect CPU's CPUID capabilities, and grab CPU string + __cpuid(cpu_id, 0x00000000); + u32 max_std_fn = cpu_id[0]; // EAX + std::memcpy(&brand_string[0], &cpu_id[1], sizeof(int)); + std::memcpy(&brand_string[4], &cpu_id[3], sizeof(int)); + std::memcpy(&brand_string[8], &cpu_id[2], sizeof(int)); + __cpuid(cpu_id, 0x80000000); + u32 max_ex_fn = cpu_id[0]; + if (!strcmp(brand_string, "GenuineIntel")) + vendor = CPUVendor::Intel; + else if (!strcmp(brand_string, "AuthenticAMD")) + vendor = CPUVendor::AMD; + else + vendor = CPUVendor::Other; + + // Set reasonable default brand string even if brand string not available. + strcpy(cpu_string, brand_string); + + // Detect family and other misc stuff. + bool ht = false; + HTT = ht; + logical_cpu_count = 1; + if (max_std_fn >= 1) + { + __cpuid(cpu_id, 0x00000001); + int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff); + int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0); + // Detect people unfortunate enough to be running Dolphin on an Atom + if (family == 6 && + (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 || + model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D)) + bAtom = true; + logical_cpu_count = (cpu_id[1] >> 16) & 0xFF; + ht = (cpu_id[3] >> 28) & 1; + + if ((cpu_id[3] >> 25) & 1) + bSSE = true; + if ((cpu_id[3] >> 26) & 1) + bSSE2 = true; + if ((cpu_id[2]) & 1) + bSSE3 = true; + if ((cpu_id[2] >> 9) & 1) + bSSSE3 = true; + if ((cpu_id[2] >> 19) & 1) + bSSE4_1 = true; + if ((cpu_id[2] >> 20) & 1) + bSSE4_2 = true; + if ((cpu_id[2] >> 22) & 1) + bMOVBE = true; + if ((cpu_id[2] >> 25) & 1) + bAES = true; + + if ((cpu_id[3] >> 24) & 1) + { + // We can use FXSAVE. + bFXSR = true; + } + + // AVX support requires 3 separate checks: + // - Is the AVX bit set in CPUID? + // - Is the XSAVE bit set in CPUID? 
+ // - XGETBV result has the XCR bit set. + if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) + { + if ((xgetbv(XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) + { + bAVX = true; + if ((cpu_id[2] >> 12) & 1) + bFMA = true; + } + } + + if (max_std_fn >= 7) + { + __cpuidex(cpu_id, 0x00000007, 0x00000000); + // careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed + if ((cpu_id[1] >> 5) & 1) + bAVX2 = bAVX; + if ((cpu_id[1] >> 3) & 1) + bBMI1 = true; + if ((cpu_id[1] >> 8) & 1) + bBMI2 = true; + } + } + + bFlushToZero = bSSE; + + if (max_ex_fn >= 0x80000004) + { + // Extract CPU model string + __cpuid(cpu_id, 0x80000002); + memcpy(cpu_string, cpu_id, sizeof(cpu_id)); + __cpuid(cpu_id, 0x80000003); + memcpy(cpu_string + 16, cpu_id, sizeof(cpu_id)); + __cpuid(cpu_id, 0x80000004); + memcpy(cpu_string + 32, cpu_id, sizeof(cpu_id)); + } + if (max_ex_fn >= 0x80000001) + { + // Check for more features. + __cpuid(cpu_id, 0x80000001); + if (cpu_id[2] & 1) + bLAHFSAHF64 = true; + if ((cpu_id[2] >> 5) & 1) + bLZCNT = true; + if ((cpu_id[2] >> 16) & 1) + bFMA4 = true; + if ((cpu_id[3] >> 29) & 1) + bLongMode = true; + } + + num_cores = (logical_cpu_count == 0) ? 1 : logical_cpu_count; + + if (max_ex_fn >= 0x80000008) + { + // Get number of cores. This is a bit complicated. Following AMD manual here. + __cpuid(cpu_id, 0x80000008); + int apic_id_core_id_size = (cpu_id[2] >> 12) & 0xF; + if (apic_id_core_id_size == 0) + { + if (ht) + { + // New mechanism for modern Intel CPUs. + if (vendor == CPUVendor::Intel) + { + __cpuidex(cpu_id, 0x00000004, 0x00000000); + int cores_x_package = ((cpu_id[0] >> 26) & 0x3F) + 1; + HTT = (cores_x_package < logical_cpu_count); + cores_x_package = ((logical_cpu_count % cores_x_package) == 0) ? cores_x_package : 1; + num_cores = (cores_x_package > 1) ? cores_x_package : num_cores; + logical_cpu_count /= cores_x_package; + } + } + } + else + { + // Use AMD's new method. + num_cores = (cpu_id[2] & 0xFF) + 1; + } + } +} + +// Turn the CPU info into a string we can show +std::string CPUInfo::Summarize() +{ + std::string sum(cpu_string); + sum += " ("; + sum += brand_string; + sum += ")"; + + if (bSSE) + sum += ", SSE"; + if (bSSE2) + { + sum += ", SSE2"; + if (!bFlushToZero) + sum += " (but not DAZ!)"; + } + if (bSSE3) + sum += ", SSE3"; + if (bSSSE3) + sum += ", SSSE3"; + if (bSSE4_1) + sum += ", SSE4.1"; + if (bSSE4_2) + sum += ", SSE4.2"; + if (HTT) + sum += ", HTT"; + if (bAVX) + sum += ", AVX"; + if (bAVX2) + sum += ", AVX2"; + if (bBMI1) + sum += ", BMI1"; + if (bBMI2) + sum += ", BMI2"; + if (bFMA) + sum += ", FMA"; + if (bAES) + sum += ", AES"; + if (bMOVBE) + sum += ", MOVBE"; + if (bLongMode) + sum += ", 64-bit support"; + return sum; +} diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp new file mode 100644 index 0000000..343f314 --- /dev/null +++ b/src/dolphin/x64Emitter.cpp @@ -0,0 +1,3399 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +#include <cinttypes> +#include <cstring> + +#include "CPUDetect.h" +#include "../types.h" +#include "x64Emitter.h" +#include "x64Reg.h" +#include "Compat.h" +#include "CommonFuncs.h" + +namespace Gen +{ +// TODO(ector): Add EAX special casing, for ever so slightly smaller code. 
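Before the emitter itself, a brief hedged sketch of how the detection results above are typically consumed; LogHostCPU is an invented name, while cpu_info, Summarize() and the feature booleans are the interface declared in CPUDetect.h:

    #include <cstdio>
    #include "CPUDetect.h"

    void LogHostCPU()
    {
        // cpu_info is the global instance declared in CPUDetect.h; its
        // constructor runs Detect(), so the flags are ready to read here.
        std::printf("Host CPU: %s\n", cpu_info.Summarize().c_str());

        // bAVX is only set once all three checks above passed (AVX CPUID bit,
        // OSXSAVE bit, and XGETBV reporting XMM+YMM state enabled by the OS).
        if (cpu_info.bAVX && !cpu_info.bAVX2)
            std::printf("note: AVX available but AVX2 is not\n");
        if (!cpu_info.bSSE4_1)
            std::printf("note: SSE4.1 missing, SSE2 fallback paths required\n");
    }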
+struct NormalOpDef +{ + u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext; +}; + +// 0xCC is code for invalid combination of immediates +static const NormalOpDef normalops[11] = { + {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, // ADD + {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, // ADC + + {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, // SUB + {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, // SBB + + {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, // AND + {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, // OR + + {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, // XOR + {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, // MOV + + {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, // TEST (to == from) + {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, // CMP + + {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, // XCHG +}; + +enum NormalSSEOps +{ + sseCMP = 0xC2, + sseADD = 0x58, // ADD + sseSUB = 0x5C, // SUB + sseAND = 0x54, // AND + sseANDN = 0x55, // ANDN + sseOR = 0x56, + sseXOR = 0x57, + sseMUL = 0x59, // MUL + sseDIV = 0x5E, // DIV + sseMIN = 0x5D, // MIN + sseMAX = 0x5F, // MAX + sseCOMIS = 0x2F, // COMIS + sseUCOMIS = 0x2E, // UCOMIS + sseSQRT = 0x51, // SQRT + sseRCP = 0x53, // RCP + sseRSQRT = 0x52, // RSQRT (NO DOUBLE PRECISION!!!) + sseMOVAPfromRM = 0x28, // MOVAP from RM + sseMOVAPtoRM = 0x29, // MOVAP to RM + sseMOVUPfromRM = 0x10, // MOVUP from RM + sseMOVUPtoRM = 0x11, // MOVUP to RM + sseMOVLPfromRM = 0x12, + sseMOVLPtoRM = 0x13, + sseMOVHPfromRM = 0x16, + sseMOVHPtoRM = 0x17, + sseMOVHLPS = 0x12, + sseMOVLHPS = 0x16, + sseMOVDQfromRM = 0x6F, + sseMOVDQtoRM = 0x7F, + sseMASKMOVDQU = 0xF7, + sseLDDQU = 0xF0, + sseSHUF = 0xC6, + sseMOVNTDQ = 0xE7, + sseMOVNTP = 0x2B, +}; + +enum class NormalOp +{ + ADD, + ADC, + SUB, + SBB, + AND, + OR, + XOR, + MOV, + TEST, + CMP, + XCHG, +}; + +enum class FloatOp +{ + LD = 0, + ST = 2, + STP = 3, + LD80 = 5, + STP80 = 7, + + Invalid = -1, +}; + +void XEmitter::SetCodePtr(u8* ptr) +{ + code = ptr; +} + +const u8* XEmitter::GetCodePtr() const +{ + return code; +} + +u8* XEmitter::GetWritableCodePtr() +{ + return code; +} + +void XEmitter::Write8(u8 value) +{ + *code++ = value; +} + +void XEmitter::Write16(u16 value) +{ + std::memcpy(code, &value, sizeof(u16)); + code += sizeof(u16); +} + +void XEmitter::Write32(u32 value) +{ + std::memcpy(code, &value, sizeof(u32)); + code += sizeof(u32); +} + +void XEmitter::Write64(u64 value) +{ + std::memcpy(code, &value, sizeof(u64)); + code += sizeof(u64); +} + +void XEmitter::ReserveCodeSpace(int bytes) +{ + for (int i = 0; i < bytes; i++) + *code++ = 0xCC; +} + +u8* XEmitter::AlignCodeTo(size_t alignment) +{ + ASSERT_MSG(DYNA_REC, alignment != 0 && (alignment & (alignment - 1)) == 0, + "Alignment must be power of two"); + u64 c = reinterpret_cast<u64>(code) & (alignment - 1); + if (c) + ReserveCodeSpace(static_cast<int>(alignment - c)); + return code; +} + +u8* XEmitter::AlignCode4() +{ + return AlignCodeTo(4); +} + +u8* XEmitter::AlignCode16() +{ + return AlignCodeTo(16); +} + +u8* XEmitter::AlignCodePage() +{ + return AlignCodeTo(4096); +} + +// This operation modifies flags; check to see the flags are locked. +// If the flags are locked, we should immediately and loudly fail before +// causing a subtle JIT bug. 
+void XEmitter::CheckFlags() +{ + ASSERT_MSG(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!"); +} + +void XEmitter::WriteModRM(int mod, int reg, int rm) +{ + Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); +} + +void XEmitter::WriteSIB(int scale, int index, int base) +{ + Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); +} + +void OpArg::WriteREX(XEmitter* emit, int opBits, int bits, int customOp) const +{ + if (customOp == -1) + customOp = operandReg; + u8 op = 0x40; + // REX.W (whether operation is a 64-bit operation) + if (opBits == 64) + op |= 8; + // REX.R (whether ModR/M reg field refers to R8-R15. + if (customOp & 8) + op |= 4; + // REX.X (whether ModR/M SIB index field refers to R8-R15) + if (indexReg & 8) + op |= 2; + // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15) + if (offsetOrBaseReg & 8) + op |= 1; + // Write REX if wr have REX bits to write, or if the operation accesses + // SIL, DIL, BPL, or SPL. + if (op != 0x40 || (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) || + (opBits == 8 && (customOp & 0x10c) == 4)) + { + emit->Write8(op); + // Check the operation doesn't access AH, BH, CH, or DH. + DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0); + DEBUG_ASSERT((customOp & 0x100) == 0); + } +} + +void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W) const +{ + int R = !(regOp1 & 8); + int X = !(indexReg & 8); + int B = !(offsetOrBaseReg & 8); + + int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf); + + // do we need any VEX fields that only appear in the three-byte form? + if (X == 1 && B == 1 && W == 0 && mmmmm == 1) + { + u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp; + emit->Write8(0xC5); + emit->Write8(RvvvvLpp); + } + else + { + u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm; + u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp; + emit->Write8(0xC4); + emit->Write8(RXBmmmmm); + emit->Write8(WvvvvLpp); + } +} + +void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg, + bool warn_64bit_offset) const +{ + if (_operandReg == INVALID_REG) + _operandReg = (X64Reg)this->operandReg; + int mod = 0; + int ireg = indexReg; + bool SIB = false; + int _offsetOrBaseReg = this->offsetOrBaseReg; + + if (scale == SCALE_RIP) // Also, on 32-bit, just an immediate address + { + // Oh, RIP addressing. + _offsetOrBaseReg = 5; + emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); + // TODO : add some checks + u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; + s64 distance = (s64)offset - (s64)ripAddr; + ASSERT_MSG(DYNA_REC, + (distance < 0x80000000LL && distance >= -0x80000000LL) || !warn_64bit_offset, + "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", ripAddr, offset); + s32 offs = (s32)distance; + emit->Write32((u32)offs); + return; + } + + if (scale == 0) + { + // Oh, no memory, Just a reg. + mod = 3; // 11 + } + else + { + // Ah good, no scaling. + if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5)) + { + // Okay, we're good. No SIB necessary. 
+ int ioff = (int)offset; + if (ioff == 0) + { + mod = 0; + } + else if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) + { + SIB = true; + mod = 0; + _offsetOrBaseReg = 5; + } + else + { + if ((_offsetOrBaseReg & 7) == 4) // this would occupy the SIB encoding :( + { + // So we have to fake it with SIB encoding :( + SIB = true; + } + + if (scale >= SCALE_1 && scale < SCALE_ATREG) + { + SIB = true; + } + + if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) + { + SIB = true; + ireg = _offsetOrBaseReg; + } + + // Okay, we're fine. Just disp encoding. + // We need displacement. Which size? + int ioff = (int)(s64)offset; + if (ioff < -128 || ioff > 127) + { + mod = 2; // 32-bit displacement + } + else + { + mod = 1; // 8-bit displacement + } + } + } + + // Okay. Time to do the actual writing + // ModRM byte: + int oreg = _offsetOrBaseReg; + if (SIB) + oreg = 4; + + emit->WriteModRM(mod, _operandReg & 7, oreg & 7); + + if (SIB) + { + // SIB byte + int ss; + switch (scale) + { + case SCALE_NONE: + _offsetOrBaseReg = 4; + ss = 0; + break; // RSP + case SCALE_1: + ss = 0; + break; + case SCALE_2: + ss = 1; + break; + case SCALE_4: + ss = 2; + break; + case SCALE_8: + ss = 3; + break; + case SCALE_NOBASE_2: + ss = 1; + break; + case SCALE_NOBASE_4: + ss = 2; + break; + case SCALE_NOBASE_8: + ss = 3; + break; + case SCALE_ATREG: + ss = 0; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "Invalid scale for SIB byte"); + ss = 0; + break; + } + emit->Write8((u8)((ss << 6) | ((ireg & 7) << 3) | (_offsetOrBaseReg & 7))); + } + + if (mod == 1) // 8-bit disp + { + emit->Write8((u8)(s8)(s32)offset); + } + else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) // 32-bit disp + { + emit->Write32((u32)offset); + } +} + +// W = operand extended width (1 if 64-bit) +// R = register# upper bit +// X = scale amnt upper bit +// B = base register# upper bit +void XEmitter::Rex(int w, int r, int x, int b) +{ + w = w ? 1 : 0; + r = r ? 1 : 0; + x = x ? 1 : 0; + b = b ? 
1 : 0; + u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); + if (rx != 0x40) + Write8(rx); +} + +void XEmitter::JMP(const u8* addr, bool force5Bytes) +{ + u64 fn = (u64)addr; + if (!force5Bytes) + { + s64 distance = (s64)(fn - ((u64)code + 2)); + ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + // 8 bits will do + Write8(0xEB); + Write8((u8)(s8)distance); + } + else + { + s64 distance = (s64)(fn - ((u64)code + 5)); + + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0xE9); + Write32((u32)(s32)distance); + } +} + +void XEmitter::JMPptr(const OpArg& arg2) +{ + OpArg arg = arg2; + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "JMPptr - Imm argument"); + arg.operandReg = 4; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +// Can be used to trap other processors, before overwriting their code +// not used in Dolphin +void XEmitter::JMPself() +{ + Write8(0xEB); + Write8(0xFE); +} + +void XEmitter::CALLptr(OpArg arg) +{ + if (arg.IsImm()) + ASSERT_MSG(DYNA_REC, 0, "CALLptr - Imm argument"); + arg.operandReg = 2; + arg.WriteREX(this, 0, 0); + Write8(0xFF); + arg.WriteRest(this); +} + +void XEmitter::CALL(const void* fnptr) +{ + u64 distance = u64(fnptr) - (u64(code) + 5); + ASSERT_MSG(DYNA_REC, distance < 0x0000000080000000ULL || distance >= 0xFFFFFFFF80000000ULL, + "CALL out of range (%p calls %p)", code, fnptr); + Write8(0xE8); + Write32(u32(distance)); +} + +FixupBranch XEmitter::CALL() +{ + FixupBranch branch; + branch.type = FixupBranch::Type::Branch32Bit; + branch.ptr = code + 5; + Write8(0xE8); + Write32(0); + return branch; +} + +FixupBranch XEmitter::J(bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 5 : 2); + if (!force5bytes) + { + // 8 bits will do + Write8(0xEB); + Write8(0); + } + else + { + Write8(0xE9); + Write32(0); + } + return branch; +} + +FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) +{ + FixupBranch branch; + branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit; + branch.ptr = code + (force5bytes ? 
6 : 2); + if (!force5bytes) + { + // 8 bits will do + Write8(0x70 + conditionCode); + Write8(0); + } + else + { + Write8(0x0F); + Write8(0x80 + conditionCode); + Write32(0); + } + return branch; +} + +void XEmitter::J_CC(CCFlags conditionCode, const u8* addr) +{ + u64 fn = (u64)addr; + s64 distance = (s64)(fn - ((u64)code + 2)); + if (distance < -0x80 || distance >= 0x80) + { + distance = (s64)(fn - ((u64)code + 6)); + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + Write8(0x0F); + Write8(0x80 + conditionCode); + Write32((u32)(s32)distance); + } + else + { + Write8(0x70 + conditionCode); + Write8((u8)(s8)distance); + } +} + +void XEmitter::SetJumpTarget(const FixupBranch& branch) +{ + if (branch.type == FixupBranch::Type::Branch8Bit) + { + s64 distance = (s64)(code - branch.ptr); + if (!(distance >= -0x80 && distance < 0x80)) + { + printf("miauz\n"); + } + ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80, + "Jump target too far away, needs force5Bytes = true"); + branch.ptr[-1] = (u8)(s8)distance; + } + else if (branch.type == FixupBranch::Type::Branch32Bit) + { + s64 distance = (s64)(code - branch.ptr); + ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, + "Jump target too far away, needs indirect register"); + + s32 valid_distance = static_cast<s32>(distance); + std::memcpy(&branch.ptr[-4], &valid_distance, sizeof(s32)); + } +} + +// Single byte opcodes +// There is no PUSHAD/POPAD in 64-bit mode. +void XEmitter::INT3() +{ + Write8(0xCC); +} +void XEmitter::RET() +{ + Write8(0xC3); +} +void XEmitter::RET_FAST() +{ + Write8(0xF3); + Write8(0xC3); +} // two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to + // a ret + +// The first sign of decadence: optimized NOPs. +void XEmitter::NOP(size_t size) +{ + DEBUG_ASSERT((int)size > 0); + while (true) + { + switch (size) + { + case 0: + return; + case 1: + Write8(0x90); + return; + case 2: + Write8(0x66); + Write8(0x90); + return; + case 3: + Write8(0x0F); + Write8(0x1F); + Write8(0x00); + return; + case 4: + Write8(0x0F); + Write8(0x1F); + Write8(0x40); + Write8(0x00); + return; + case 5: + Write8(0x0F); + Write8(0x1F); + Write8(0x44); + Write8(0x00); + Write8(0x00); + return; + case 6: + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x44); + Write8(0x00); + Write8(0x00); + return; + case 7: + Write8(0x0F); + Write8(0x1F); + Write8(0x80); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + case 8: + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + case 9: + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + case 10: + Write8(0x66); + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + return; + default: + // Even though x86 instructions are allowed to be up to 15 bytes long, + // AMD advises against using NOPs longer than 11 bytes because they + // carry a performance penalty on CPUs older than AMD family 16h. 
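+ // So anything longer than 10 bytes is emitted as a chain of these
+ // 11-byte NOPs (66 66 66 0F 1F 84 00 00 00 00 00), with the remainder
+ // handled by one of the shorter forms above, e.g. NOP(25) -> 11+11+3.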
+ Write8(0x66); + Write8(0x66); + Write8(0x66); + Write8(0x0F); + Write8(0x1F); + Write8(0x84); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + Write8(0x00); + size -= 11; + continue; + } + } +} + +void XEmitter::PAUSE() +{ + Write8(0xF3); + NOP(); +} // use in tight spinloops for energy saving on some CPU +void XEmitter::CLC() +{ + CheckFlags(); + Write8(0xF8); +} // clear carry +void XEmitter::CMC() +{ + CheckFlags(); + Write8(0xF5); +} // flip carry +void XEmitter::STC() +{ + CheckFlags(); + Write8(0xF9); +} // set carry + +// TODO: xchg ah, al ??? +void XEmitter::XCHG_AHAL() +{ + Write8(0x86); + Write8(0xe0); + // alt. 86 c4 +} + +// These two can not be executed on early Intel 64-bit CPU:s, only on AMD! +void XEmitter::LAHF() +{ + Write8(0x9F); +} +void XEmitter::SAHF() +{ + CheckFlags(); + Write8(0x9E); +} + +void XEmitter::PUSHF() +{ + Write8(0x9C); +} +void XEmitter::POPF() +{ + CheckFlags(); + Write8(0x9D); +} + +void XEmitter::LFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xE8); +} +void XEmitter::MFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xF0); +} +void XEmitter::SFENCE() +{ + Write8(0x0F); + Write8(0xAE); + Write8(0xF8); +} + +void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte + ((int)reg & 7)); +} + +void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, (int)reg >> 3); + Write8(byte1); + Write8(byte2 + ((int)reg & 7)); +} + +void XEmitter::CWD(int bits) +{ + if (bits == 16) + Write8(0x66); + Rex(bits == 64, 0, 0, 0); + Write8(0x99); +} + +void XEmitter::CBW(int bits) +{ + if (bits == 8) + Write8(0x66); + Rex(bits == 32, 0, 0, 0); + Write8(0x98); +} + +// Simple opcodes + +// push/pop do not need wide to be 64-bit +void XEmitter::PUSH(X64Reg reg) +{ + WriteSimple1Byte(32, 0x50, reg); +} +void XEmitter::POP(X64Reg reg) +{ + WriteSimple1Byte(32, 0x58, reg); +} + +void XEmitter::PUSH(int bits, const OpArg& reg) +{ + if (reg.IsSimpleReg()) + PUSH(reg.GetSimpleReg()); + else if (reg.IsImm()) + { + switch (reg.GetImmBits()) + { + case 8: + Write8(0x6A); + Write8((u8)(s8)reg.offset); + break; + case 16: + Write8(0x66); + Write8(0x68); + Write16((u16)(s16)(s32)reg.offset); + break; + case 32: + Write8(0x68); + Write32((u32)reg.offset); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "PUSH - Bad imm bits"); + break; + } + } + else + { + if (bits == 16) + Write8(0x66); + reg.WriteREX(this, bits, bits); + Write8(0xFF); + reg.WriteRest(this, 0, (X64Reg)6); + } +} + +void XEmitter::POP(int /*bits*/, const OpArg& reg) +{ + if (reg.IsSimpleReg()) + POP(reg.GetSimpleReg()); + else + ASSERT_MSG(DYNA_REC, 0, "POP - Unsupported encoding"); +} + +void XEmitter::BSWAP(int bits, X64Reg reg) +{ + if (bits >= 32) + { + WriteSimple2Byte(bits, 0x0F, 0xC8, reg); + } + else if (bits == 16) + { + ROL(16, R(reg), Imm8(8)); + } + else if (bits == 8) + { + // Do nothing - can't bswap a single byte... + } + else + { + ASSERT_MSG(DYNA_REC, 0, "BSWAP - Wrong number of bits"); + } +} + +// Undefined opcode - reserved +// If we ever need a way to always cause a non-breakpoint hard exception... 
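+// UD2 is the two-byte sequence 0F 0B; it raises #UD instead of the
+// breakpoint exception INT3 uses, so a debugger won't silently swallow it.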
+void XEmitter::UD2() +{ + Write8(0x0F); + Write8(0x0B); +} + +void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) +{ + ASSERT_MSG(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument"); + arg.operandReg = (u8)level; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0x18); + arg.WriteRest(this); +} + +void XEmitter::SETcc(CCFlags flag, OpArg dest) +{ + ASSERT_MSG(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument"); + dest.operandReg = 0; + dest.WriteREX(this, 0, 8); + Write8(0x0F); + Write8(0x90 + (u8)flag); + dest.WriteRest(this); +} + +void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument"); + ASSERT_MSG(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported"); + if (bits == 16) + Write8(0x66); + src.operandReg = dest; + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0x40 + (u8)flag); + src.WriteRest(this); +} + +void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument"); + CheckFlags(); + src.operandReg = ext; + if (bits == 16) + Write8(0x66); + src.WriteREX(this, bits, bits, 0); + if (bits == 8) + { + Write8(0xF6); + } + else + { + Write8(0xF7); + } + src.WriteRest(this); +} + +void XEmitter::MUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 4); +} +void XEmitter::DIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 6); +} +void XEmitter::IMUL(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 5); +} +void XEmitter::IDIV(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 7); +} +void XEmitter::NEG(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 3); +} +void XEmitter::NOT(int bits, const OpArg& src) +{ + WriteMulDivType(bits, src, 2); +} + +void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument"); + CheckFlags(); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); + if (rep) + Write8(0xF3); + src.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(byte2); + src.WriteRest(this); +} + +void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src) +{ + if (bits <= 16) + ASSERT_MSG(DYNA_REC, 0, "MOVNTI - bits<=16"); + WriteBitSearchType(bits, src, dest, 0xC3); +} + +void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBC); +} // Bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src) +{ + WriteBitSearchType(bits, dest, src, 0xBD); +} // Top bit to bottom bit + +void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBC, true); +} +void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src) +{ + CheckFlags(); + if (!cpu_info.bLZCNT) + PanicAlert("Trying to use LZCNT on a system that doesn't support it. 
Bad programmer."); + WriteBitSearchType(bits, dest, src, 0xBD, true); +} + +void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + src.WriteREX(this, dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xBE); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xBF); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x63); + } + else + { + Crash(); + } + src.WriteRest(this); +} + +void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument"); + if (dbits == sbits) + { + MOV(dbits, R(dest), src); + return; + } + src.operandReg = (u8)dest; + if (dbits == 16) + Write8(0x66); + // the 32bit result is automatically zero extended to 64bit + src.WriteREX(this, dbits == 64 ? 32 : dbits, sbits); + if (sbits == 8) + { + Write8(0x0F); + Write8(0xB6); + } + else if (sbits == 16) + { + Write8(0x0F); + Write8(0xB7); + } + else if (sbits == 32 && dbits == 64) + { + Write8(0x8B); + } + else + { + ASSERT_MSG(DYNA_REC, 0, "MOVZX - Invalid size"); + } + src.WriteRest(this); +} + +void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg) +{ + ASSERT_MSG(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it."); + if (bits == 8) + { + MOV(8, op & 1 ? arg : R(reg), op & 1 ? R(reg) : arg); + return; + } + if (bits == 16) + Write8(0x66); + ASSERT_MSG(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!"); + arg.WriteREX(this, bits, bits, reg); + Write8(0x0F); + Write8(0x38); + Write8(op); + arg.WriteRest(this, 0, reg); +} +void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src) +{ + WriteMOVBE(bits, 0xF0, dest, src); +} +void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src) +{ + WriteMOVBE(bits, 0xF1, src, dest); +} + +void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info) +{ + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + + switch (size) + { + case 8: + if (sign_extend) + MOVSX(32, 8, dst, src); + else + MOVZX(32, 8, dst, src); + break; + case 16: + MOVZX(32, 16, dst, src); + if (sign_extend) + { + BSWAP(32, dst); + SAR(32, R(dst), Imm8(16)); + } + else + { + ROL(16, R(dst), Imm8(8)); + } + break; + case 32: + case 64: + if (cpu_info.bMOVBE) + { + MOVBE(size, dst, src); + } + else + { + MOV(size, R(dst), src); + BSWAP(size, dst); + } + break; + } +} + +void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info) +{ + if (cpu_info.bMOVBE) + { + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + MOVBE(size, dst, src); + } + else + { + BSWAP(size, src); + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = true; + info->nonAtomicSwapStoreSrc = src; + } + MOV(size, dst, R(src)); + } +} + +void XEmitter::LEA(int bits, X64Reg dest, OpArg src) +{ + ASSERT_MSG(DYNA_REC, !src.IsImm(), "LEA - Imm argument"); + src.operandReg = (u8)dest; + if (bits == 16) + Write8(0x66); // TODO: performance warning + src.WriteREX(this, bits, bits); + Write8(0x8D); + src.WriteRest(this, 0, INVALID_REG, bits == 64); +} + +// shift can be either imm8 or cl +void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext) +{ + CheckFlags(); + bool writeImm = false; + if 
(dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteShift - can't shift imms"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "WriteShift - illegal argument"); + } + dest.operandReg = ext; + if (bits == 16) + Write8(0x66); + dest.WriteREX(this, bits, bits, 0); + if (shift.GetImmBits() == 8) + { + // ok an imm + u8 imm = (u8)shift.offset; + if (imm == 1) + { + Write8(bits == 8 ? 0xD0 : 0xD1); + } + else + { + writeImm = true; + Write8(bits == 8 ? 0xC0 : 0xC1); + } + } + else + { + Write8(bits == 8 ? 0xD2 : 0xD3); + } + dest.WriteRest(this, writeImm ? 1 : 0); + if (writeImm) + Write8((u8)shift.offset); +} + +// large rotates and shift are slower on Intel than AMD +// Intel likes to rotate by 1, and the op is smaller too +void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 0); +} +void XEmitter::ROR_(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 1); +} +void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 2); +} +void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 3); +} +void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 4); +} +void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 5); +} +void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift) +{ + WriteShift(bits, dest, shift, 7); +} + +// index can be either imm8 or register, don't use memory destination because it's slow +void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - can't test imms"); + } + if ((index.IsImm() && index.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - illegal argument"); + } + if (bits == 16) + Write8(0x66); + if (index.IsImm()) + { + dest.WriteREX(this, bits, bits); + Write8(0x0F); + Write8(0xBA); + dest.WriteRest(this, 1, (X64Reg)ext); + Write8((u8)index.offset); + } + else + { + X64Reg operand = index.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + Write8(0x0F); + Write8(0x83 + 8 * ext); + dest.WriteRest(this, 1, operand); + } +} + +void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 4); +} +void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 5); +} +void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 6); +} +void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index) +{ + WriteBitTest(bits, dest, index, 7); +} + +// shift can be either imm8 or cl +void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "SHRD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); + Write8(0xAC); + 
dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); + Write8(0xAD); + dest.WriteRest(this, 0, operand); + } +} + +void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift) +{ + CheckFlags(); + if (dest.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - can't use imms as destination"); + } + if (!src.IsSimpleReg()) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - must use simple register as source"); + } + if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || + (shift.IsImm() && shift.GetImmBits() != 8)) + { + ASSERT_MSG(DYNA_REC, 0, "SHLD - illegal shift"); + } + if (bits == 16) + Write8(0x66); + X64Reg operand = src.GetSimpleReg(); + dest.WriteREX(this, bits, bits, operand); + if (shift.GetImmBits() == 8) + { + Write8(0x0F); + Write8(0xA4); + dest.WriteRest(this, 1, operand); + Write8((u8)shift.offset); + } + else + { + Write8(0x0F); + Write8(0xA5); + dest.WriteRest(this, 0, operand); + } +} + +void OpArg::WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg _operandReg, int bits) +{ + if (bits == 16) + emit->Write8(0x66); + + this->operandReg = (u8)_operandReg; + WriteREX(emit, bits, bits); + emit->Write8(op); + WriteRest(emit); +} + +// operand can either be immediate or register +void OpArg::WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, + int bits) const +{ + X64Reg _operandReg; + if (IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order"); + } + + if (bits == 16) + emit->Write8(0x66); + + int immToWrite = 0; + const NormalOpDef& op_def = normalops[static_cast<int>(op)]; + + if (operand.IsImm()) + { + WriteREX(emit, bits, bits); + + if (!toRM) + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)"); + } + + if (operand.scale == SCALE_IMM8 && bits == 8) + { + // op al, imm8 + if (!scale && offsetOrBaseReg == AL && op_def.eaximm8 != 0xCC) + { + emit->Write8(op_def.eaximm8); + emit->Write8((u8)operand.offset); + return; + } + // mov reg, imm8 + if (!scale && op == NormalOp::MOV) + { + emit->Write8(0xB0 + (offsetOrBaseReg & 7)); + emit->Write8((u8)operand.offset); + return; + } + // op r/m8, imm8 + emit->Write8(op_def.imm8); + immToWrite = 8; + } + else if ((operand.scale == SCALE_IMM16 && bits == 16) || + (operand.scale == SCALE_IMM32 && bits == 32) || + (operand.scale == SCALE_IMM32 && bits == 64)) + { + // Try to save immediate size if we can, but first check to see + // if the instruction supports simm8. + // op r/m, imm8 + if (op_def.simm8 != 0xCC && + ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) || + (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset))) + { + emit->Write8(op_def.simm8); + immToWrite = 8; + } + else + { + // mov reg, imm + if (!scale && op == NormalOp::MOV && bits != 64) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op eax, imm + if (!scale && offsetOrBaseReg == EAX && op_def.eaximm32 != 0xCC) + { + emit->Write8(op_def.eaximm32); + if (bits == 16) + emit->Write16((u16)operand.offset); + else + emit->Write32((u32)operand.offset); + return; + } + // op r/m, imm + emit->Write8(op_def.imm32); + immToWrite = bits == 16 ? 
16 : 32; + } + } + else if ((operand.scale == SCALE_IMM8 && bits == 16) || + (operand.scale == SCALE_IMM8 && bits == 32) || + (operand.scale == SCALE_IMM8 && bits == 64)) + { + // op r/m, imm8 + emit->Write8(op_def.simm8); + immToWrite = 8; + } + else if (operand.scale == SCALE_IMM64 && bits == 64) + { + if (scale) + { + ASSERT_MSG(DYNA_REC, 0, + "WriteNormalOp - MOV with 64-bit imm requires register destination"); + } + // mov reg64, imm64 + else if (op == NormalOp::MOV) + { + // movabs reg64, imm64 (10 bytes) + if (static_cast<s64>(operand.offset) != static_cast<s32>(operand.offset)) + { + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + emit->Write64(operand.offset); + return; + } + // mov reg64, simm32 (7 bytes) + emit->Write8(op_def.imm32); + immToWrite = 32; + } + else + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm"); + } + } + else + { + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits); + } + + // pass extension in REG of ModRM + _operandReg = static_cast<X64Reg>(op_def.ext); + } + else + { + _operandReg = (X64Reg)operand.offsetOrBaseReg; + WriteREX(emit, bits, bits, _operandReg); + // op r/m, reg + if (toRM) + { + emit->Write8(bits == 8 ? op_def.toRm8 : op_def.toRm32); + } + // op reg, r/m + else + { + emit->Write8(bits == 8 ? op_def.fromRm8 : op_def.fromRm32); + } + } + WriteRest(emit, immToWrite >> 3, _operandReg); + switch (immToWrite) + { + case 0: + break; + case 8: + emit->Write8((u8)operand.offset); + break; + case 16: + emit->Write16((u16)operand.offset); + break; + case 32: + emit->Write32((u32)operand.offset); + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case"); + } +} + +void XEmitter::WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2) +{ + if (a1.IsImm()) + { + // Booh! 
Can't write to an imm + ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm"); + return; + } + if (a2.IsImm()) + { + a1.WriteNormalOp(this, true, op, a2, bits); + } + else + { + if (a1.IsSimpleReg()) + { + a2.WriteNormalOp(this, false, op, a1, bits); + } + else + { + ASSERT_MSG(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(), + "WriteNormalOp - a1 and a2 cannot both be memory"); + a1.WriteNormalOp(this, true, op, a2, bits); + } + } +} + +void XEmitter::ADD(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::ADD, a1, a2); +} +void XEmitter::ADC(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::ADC, a1, a2); +} +void XEmitter::SUB(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::SUB, a1, a2); +} +void XEmitter::SBB(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::SBB, a1, a2); +} +void XEmitter::AND(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::AND, a1, a2); +} +void XEmitter::OR(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::OR, a1, a2); +} +void XEmitter::XOR(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::XOR, a1, a2); +} +void XEmitter::MOV(int bits, const OpArg& a1, const OpArg& a2) +{ + if (bits == 64 && a1.IsSimpleReg() && a2.scale == SCALE_IMM64 && + a2.offset == static_cast<u32>(a2.offset)) + { + WriteNormalOp(32, NormalOp::MOV, a1, a2.AsImm32()); + return; + } + if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) + { + ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code); + } + WriteNormalOp(bits, NormalOp::MOV, a1, a2); +} +void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::TEST, a1, a2); +} +void XEmitter::CMP(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + WriteNormalOp(bits, NormalOp::CMP, a1, a2); +} +void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2) +{ + WriteNormalOp(bits, NormalOp::XCHG, a1, a2); +} +void XEmitter::CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + if (a1.IsSimpleReg() && a2.IsZero()) // turn 'CMP reg, 0' into shorter 'TEST reg, reg' + { + WriteNormalOp(bits, NormalOp::TEST, a1, a1); + } + else + { + WriteNormalOp(bits, NormalOp::CMP, a1, a2); + } +} + +void XEmitter::MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2) +{ + // This stomps on flags, so ensure they aren't locked + DEBUG_ASSERT(!flags_locked); + + // Zero shortcuts (note that this can generate no code in the case where a1 == dest && a2 == zero + // or a2 == dest && a1 == zero) + if (a1.IsZero()) + { + if (!a2.IsSimpleReg() || a2.GetSimpleReg() != dest) + { + MOV(bits, R(dest), a2); + } + return; + } + if (a2.IsZero()) + { + if (!a1.IsSimpleReg() || a1.GetSimpleReg() != dest) + { + MOV(bits, R(dest), a1); + } + return; + } + + // If dest == a1 or dest == a2 we can simplify this + if (a1.IsSimpleReg() && a1.GetSimpleReg() == dest) + { + ADD(bits, R(dest), a2); + return; + } + + if (a2.IsSimpleReg() && a2.GetSimpleReg() == dest) + { + ADD(bits, R(dest), a1); + return; + } + + // TODO: 32-bit optimizations may apply to other bit sizes (confirm) + if (bits == 32) + { + if (a1.IsImm() && a2.IsImm()) + { + MOV(32, R(dest), Imm32(a1.Imm32() + a2.Imm32())); + return; + } + + if (a1.IsSimpleReg() && 
a2.IsSimpleReg()) + { + LEA(32, dest, MRegSum(a1.GetSimpleReg(), a2.GetSimpleReg())); + return; + } + + if (a1.IsSimpleReg() && a2.IsImm()) + { + LEA(32, dest, MDisp(a1.GetSimpleReg(), a2.Imm32())); + return; + } + + if (a1.IsImm() && a2.IsSimpleReg()) + { + LEA(32, dest, MDisp(a2.GetSimpleReg(), a1.Imm32())); + return; + } + } + + // Fallback + MOV(bits, R(dest), a1); + ADD(bits, R(dest), a2); +} + +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!"); + return; + } + + if (a1.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - second arg cannot be imm!"); + return; + } + + if (!a2.IsImm()) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - third arg must be imm!"); + return; + } + + if (bits == 16) + Write8(0x66); + a1.WriteREX(this, bits, bits, regOp); + + if (a2.GetImmBits() == 8 || (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) || + (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset)) + { + Write8(0x6B); + a1.WriteRest(this, 1, regOp); + Write8((u8)a2.offset); + } + else + { + Write8(0x69); + if (a2.GetImmBits() == 16 && bits == 16) + { + a1.WriteRest(this, 2, regOp); + Write16((u16)a2.offset); + } + else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) + { + a1.WriteRest(this, 4, regOp); + Write32((u32)a2.offset); + } + else + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - unhandled case!"); + } + } +} + +void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a) +{ + CheckFlags(); + if (bits == 8) + { + ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!"); + return; + } + + if (a.IsImm()) + { + IMUL(bits, regOp, R(regOp), a); + return; + } + + if (bits == 16) + Write8(0x66); + a.WriteREX(this, bits, bits, regOp); + Write8(0x0F); + Write8(0xAF); + a.WriteRest(this, 0, regOp); +} + +void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ + if (opPrefix) + Write8(opPrefix); + arg.operandReg = regOp; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + if (op > 0xFF) + Write8((op >> 8) & 0xFF); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes); +} + +static int GetVEXmmmmm(u16 op) +{ + // Currently, only 0x38 and 0x3A are used as secondary escape byte. + if ((op >> 8) == 0x3A) + return 3; + else if ((op >> 8) == 0x38) + return 2; + else + return 1; +} + +static int GetVEXpp(u8 opPrefix) +{ + if (opPrefix == 0x66) + return 1; + else if (opPrefix == 0xF3) + return 2; + else if (opPrefix == 0xF2) + return 3; + else + return 0; +} + +void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W, int extrabytes) +{ + int mmmmm = GetVEXmmmmm(op); + int pp = GetVEXpp(opPrefix); + // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here + arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W); + Write8(op & 0xFF); + arg.WriteRest(this, extrabytes, regOp1); +} + +void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, 1); + Write8((u8)regOp3 << 4); +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W, int extrabytes) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. 
Bad programmer."); + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W) +{ + if (!cpu_info.bAVX) + PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer."); + WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W); +} + +void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W) +{ + if (!cpu_info.bFMA) + PanicAlert("Trying to use FMA3 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W); +} + +void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int W) +{ + if (!cpu_info.bFMA4) + PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd."); + WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W); +} + +void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (arg.IsImm()) + PanicAlert("BMI1/2 instructions don't support immediate operands."); + if (size != 32 && size != 64) + PanicAlert("BMI1/2 instructions only support 32-bit and 64-bit modes!"); + int W = size == 64; + WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes); +} + +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + CheckFlags(); + if (!cpu_info.bBMI1) + PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, + const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bBMI2) + PanicAlert("Trying to use BMI2 on a system that doesn't support it. 
Bad programmer."); + WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6E, dest, arg, 0); +} +void XEmitter::MOVD_xmm(const OpArg& arg, X64Reg src) +{ + WriteSSEOp(0x66, 0x7E, src, arg, 0); +} + +void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) +{ + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = dest; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x6E); + arg.WriteRest(this, 0); +} + +void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) +{ + if (src > 7 || arg.IsSimpleReg()) + { + // Alternate encoding + // This does not display correctly in MSVC's debugger, it thinks it's a MOVD + arg.operandReg = src; + Write8(0x66); + arg.WriteREX(this, 64, 0); + Write8(0x0f); + Write8(0x7E); + arg.WriteRest(this, 0); + } + else + { + arg.operandReg = src; + arg.WriteREX(this, 0, 0); + Write8(0x66); + Write8(0x0f); + Write8(0xD6); + arg.WriteRest(this, 0); + } +} + +void XEmitter::WriteMXCSR(OpArg arg, int ext) +{ + if (arg.IsImm() || arg.IsSimpleReg()) + ASSERT_MSG(DYNA_REC, 0, "MXCSR - invalid operand"); + + arg.operandReg = ext; + arg.WriteREX(this, 0, 0); + Write8(0x0F); + Write8(0xAE); + arg.WriteRest(this); +} + +void XEmitter::STMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 3); +} +void XEmitter::LDMXCSR(const OpArg& memloc) +{ + WriteMXCSR(memloc, 2); +} + +void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg); +} +void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVNTP, regOp, arg); +} +void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVNTP, regOp, arg); +} + +void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseADD, regOp, arg); +} +void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseADD, regOp, arg); +} +void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSUB, regOp, arg); +} +void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSUB, regOp, arg); +} +void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::MULSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMUL, regOp, arg); +} +void XEmitter::MULSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMUL, regOp, arg); +} +void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseDIV, regOp, arg); +} +void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseDIV, regOp, arg); +} +void XEmitter::MINSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMIN, regOp, arg); +} +void XEmitter::MINSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMIN, regOp, arg); +} +void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMAX, regOp, arg); +} +void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMAX, regOp, arg); +} +void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseSQRT, regOp, arg); +} +void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseSQRT, regOp, arg); +} +void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRCP, regOp, arg); 
+} +void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseRSQRT, regOp, arg); +} + +void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseADD, regOp, arg); +} +void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseADD, regOp, arg); +} +void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSUB, regOp, arg); +} +void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSUB, regOp, arg); +} +void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x00, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare) +{ + WriteSSEOp(0x66, sseCMP, regOp, arg, 1); + Write8(compare); +} +void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseAND, regOp, arg); +} +void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseAND, regOp, arg); +} +void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseANDN, regOp, arg); +} +void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseANDN, regOp, arg); +} +void XEmitter::ORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseOR, regOp, arg); +} +void XEmitter::ORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseOR, regOp, arg); +} +void XEmitter::XORPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseXOR, regOp, arg); +} +void XEmitter::XORPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseXOR, regOp, arg); +} +void XEmitter::MULPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMUL, regOp, arg); +} +void XEmitter::MULPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMUL, regOp, arg); +} +void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseDIV, regOp, arg); +} +void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseDIV, regOp, arg); +} +void XEmitter::MINPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMIN, regOp, arg); +} +void XEmitter::MINPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMIN, regOp, arg); +} +void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMAX, regOp, arg); +} +void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMAX, regOp, arg); +} +void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseSQRT, regOp, arg); +} +void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseSQRT, regOp, arg); +} +void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRCP, regOp, arg); +} +void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseRSQRT, regOp, arg); +} +void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x00, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, sseSHUF, regOp, arg, 1); + Write8(shuffle); +} + +void XEmitter::COMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseCOMIS, regOp, arg); +} // weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseCOMIS, regOp, arg); +} // ordered +void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseUCOMIS, regOp, arg); +} // unordered +void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseUCOMIS, regOp, arg); +} + +void 
XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg); +} +void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg); +} +void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg); +} + +void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg); +} +void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg); +} +void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg); +} + +void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg); +} +void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg); +} +void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg); +} + +void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); +} +void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); +} +void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); +} + +void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); +} +void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); +} +void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp) +{ + WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); +} + +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2)); +} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) +{ + WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2)); +} + +void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5A, regOp, arg); +} +void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5A, regOp, arg); +} + +void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x5A, regOp, arg); +} +void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5A, regOp, arg); +} +void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2D, regOp, arg); +} +void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2D, regOp, arg); +} +void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2A, regOp, arg); +} +void 
XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2A, regOp, arg); +} + +void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0xE6, regOp, arg); +} +void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x5B, regOp, arg); +} +void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0xE6, regOp, arg); +} +void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x5B, regOp, arg); +} + +void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF2, 0x2C, regOp, arg); +} +void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x2C, regOp, arg); +} +void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0xF3, 0x5B, regOp, arg); +} +void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE6, regOp, arg); +} + +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) +{ + WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src)); +} + +void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x50, dest, arg); +} +void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x50, dest, arg); +} + +void XEmitter::LDDQU(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0xF2, sseLDDQU, dest, arg); +} // For integer data only + +void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x14, dest, arg); +} +void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x00, 0x15, dest, arg); +} +void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x14, dest, arg); +} +void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x15, dest, arg); +} + +// Pretty much every x86 CPU nowadays supports SSE3, +// but the SSE2 fallbacks are easy. 
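+// Note: the non-SSE3 fallbacks below are not bit-exact. UNPCKLPS/UNPCKHPS of
+// a register with itself duplicates adjacent pairs ({a0,a0,a1,a1} and
+// {a2,a2,a3,a3}) rather than the even/odd lanes MOVSLDUP/MOVSHDUP pick
+// ({a0,a0,a2,a2} and {a1,a1,a3,a3}); only the MOVDDUP fallback is exact.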
+void XEmitter::MOVSLDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF3, 0x12, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVAPD(regOp, arg); + UNPCKLPS(regOp, R(regOp)); + } +} +void XEmitter::MOVSHDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF3, 0x16, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVAPD(regOp, arg); + UNPCKHPS(regOp, R(regOp)); + } +} +void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg) +{ + if (cpu_info.bSSE3) + { + WriteSSEOp(0xF2, 0x12, regOp, arg); + } + else + { + if (!arg.IsSimpleReg(regOp)) + MOVSD(regOp, arg); + UNPCKLPD(regOp, R(regOp)); + } +} + +// There are a few more left + +// Also some integer instructions are missing +void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6B, dest, arg); +} +void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x63, dest, arg); +} +void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x67, dest, arg); +} + +void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x60, dest, arg); +} +void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x61, dest, arg); +} +void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x62, dest, arg); +} +void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x6C, dest, arg); +} + +void XEmitter::PSRLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); + Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xd3, reg, arg); +} + +void XEmitter::PSRLDQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLW(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLD(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); + Write8(shift); +} + +void XEmitter::PSLLDQ(X64Reg reg, int shift) +{ + WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); + Write8(shift); +} + +// WARNING not REX compatible +void XEmitter::PSRAW(X64Reg reg, int shift) +{ + if (reg > 7) + PanicAlert("The PSRAW-emitter does not support regs above 7"); + Write8(0x66); + Write8(0x0f); + Write8(0x71); + Write8(0xE0 | reg); + Write8(shift); +} + +// WARNING not REX compatible +void XEmitter::PSRAD(X64Reg reg, int shift) +{ + if (reg > 7) + PanicAlert("The PSRAD-emitter does not support regs above 7"); + Write8(0x66); + Write8(0x0f); + Write8(0x72); + Write8(0xE0 | reg); + Write8(shift); +} + +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bSSSE3) + PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes) +{ + if (!cpu_info.bSSE4_1) + PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. 
Bad programmer."); + WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg) +{ + WriteSSSE3Op(0x66, 0x3800, dest, arg); +} +void XEmitter::PTEST(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3817, dest, arg); +} +void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x382b, dest, arg); +} + +void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3820, dest, arg); +} +void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3821, dest, arg); +} +void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3822, dest, arg); +} +void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3823, dest, arg); +} +void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3824, dest, arg); +} +void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3825, dest, arg); +} +void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3830, dest, arg); +} +void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3831, dest, arg); +} +void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3832, dest, arg); +} +void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3833, dest, arg); +} +void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3834, dest, arg); +} +void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3835, dest, arg); +} + +void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3810, dest, arg); +} +void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3814, dest, arg); +} +void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3815, dest, arg); +} +void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); + Write8(blend); +} +void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) +{ + WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); + Write8(blend); +} + +void XEmitter::PAND(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDB, dest, arg); +} +void XEmitter::PANDN(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDF, dest, arg); +} +void XEmitter::PXOR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEF, dest, arg); +} +void XEmitter::POR(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEB, dest, arg); +} + +void XEmitter::PADDB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFC, dest, arg); +} +void XEmitter::PADDW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFD, dest, arg); +} +void XEmitter::PADDD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFE, dest, arg); +} +void XEmitter::PADDQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD4, dest, arg); +} + +void XEmitter::PADDSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEC, dest, arg); +} +void XEmitter::PADDSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xED, dest, arg); +} +void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDC, dest, arg); +} +void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDD, dest, arg); +} + +void XEmitter::PSUBB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF8, dest, arg); +} +void XEmitter::PSUBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF9, dest, arg); +} +void 
XEmitter::PSUBD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFA, dest, arg); +} +void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xFB, dest, arg); +} + +void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE8, dest, arg); +} +void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE9, dest, arg); +} +void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD8, dest, arg); +} +void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD9, dest, arg); +} + +void XEmitter::PAVGB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE0, dest, arg); +} +void XEmitter::PAVGW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xE3, dest, arg); +} + +void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x74, dest, arg); +} +void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x75, dest, arg); +} +void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x76, dest, arg); +} + +void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x64, dest, arg); +} +void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x65, dest, arg); +} +void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0x66, dest, arg); +} + +void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC5, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSEOp(0x66, 0xC4, dest, arg); + Write8(subreg); +} +void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg) +{ + WriteSSE41Op(0x66, 0x3A22, dest, arg); + Write8(subreg); +} + +void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF5, dest, arg); +} +void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xF6, dest, arg); +} + +void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEE, dest, arg); +} +void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDE, dest, arg); +} +void XEmitter::PMINSW(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xEA, dest, arg); +} +void XEmitter::PMINUB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xDA, dest, arg); +} + +void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg) +{ + WriteSSEOp(0x66, 0xD7, dest, arg); +} +void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0x66, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF2, 0x70, regOp, arg, 1); + Write8(shuffle); +} +void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle) +{ + WriteSSEOp(0xF3, 0x70, regOp, arg, 1); + Write8(shuffle); +} + +// VEX +void XEmitter::VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF3, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + 
WriteAVXOp(0x00, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg); +} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg); +} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg); +} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg); +} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg); +} +void XEmitter::VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare) +{ + WriteAVXOp(0x66, sseCMP, regOp1, regOp2, arg, 0, 1); + Write8(compare); +} +void XEmitter::VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x00, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle) +{ + WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); + Write8(shuffle); +} +void XEmitter::VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg); +} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg); +} +void XEmitter::VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3) +{ + WriteAVXOp4(0x66, 0x3A4B, regOp1, regOp2, arg, regOp3); +} +void XEmitter::VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0C, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} +void XEmitter::VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend) +{ + WriteAVXOp(0x66, 0x3A0D, regOp1, regOp2, arg, 0, 1); + Write8(blend); +} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); +} +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); +} +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); +} +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); +} +void 
XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); +} +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); +} + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); +} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); +} +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); +} +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); +} + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x98, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB8, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg); +} +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg); +} +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x99, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB9, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9A, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBA, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg); +} +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9B, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg 
regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBB, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9C, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBC, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg); +} +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9D, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBD, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9E, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBE, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg); +} +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x9F, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xAF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xBF, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg); +} +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x96, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB6, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg); +} +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0x97, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1); +} +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1); +} + +#define FMA4(name, op) \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1); \ + } \ + void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) \ + { \ + WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0); \ + } + +FMA4(VFMADDSUBPS, 0x5C) +FMA4(VFMADDSUBPD, 0x5D) +FMA4(VFMSUBADDPS, 0x5E) +FMA4(VFMSUBADDPD, 0x5F) +FMA4(VFMADDPS, 0x68) +FMA4(VFMADDPD, 0x69) +FMA4(VFMADDSS, 0x6A) +FMA4(VFMADDSD, 0x6B) +FMA4(VFMSUBPS, 0x6C) +FMA4(VFMSUBPD, 0x6D) +FMA4(VFMSUBSS, 0x6E) +FMA4(VFMSUBSD, 0x6F) +FMA4(VFNMADDPS, 0x78) +FMA4(VFNMADDPD, 0x79) +FMA4(VFNMADDSS, 0x7A) +FMA4(VFNMADDSD, 0x7B) +FMA4(VFNMSUBPS, 0x7C) +FMA4(VFNMSUBPD, 0x7D) +FMA4(VFNMSUBSS, 0x7E) +FMA4(VFNMSUBSD, 0x7F) +#undef FMA4 + +void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate) +{ + WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); + Write8(rotate); +} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg); +} +void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + CheckFlags(); + WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg); +} +void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg); +} +void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg); +} +void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F3, 
(X64Reg)0x3, regOp, arg); +} +void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) +{ + WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg); +} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) +{ + WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg); +} + +// Prefixes + +void XEmitter::LOCK() +{ + Write8(0xF0); +} +void XEmitter::REP() +{ + Write8(0xF3); +} +void XEmitter::REPNE() +{ + Write8(0xF2); +} +void XEmitter::FSOverride() +{ + Write8(0x64); +} +void XEmitter::GSOverride() +{ + Write8(0x65); +} + +void XEmitter::FWAIT() +{ + Write8(0x9B); +} + +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg) +{ + int mf = 0; + ASSERT_MSG(DYNA_REC, !(bits == 80 && op_80b == FloatOp::Invalid), + "WriteFloatLoadStore: 80 bits not supported for this instruction"); + switch (bits) + { + case 32: + mf = 0; + break; + case 64: + mf = 4; + break; + case 80: + mf = 2; + break; + default: + ASSERT_MSG(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); + } + Write8(0xd9 | mf); + // x87 instructions use the reg field of the ModR/M byte as opcode: + if (bits == 80) + op = op_80b; + arg.WriteRest(this, 0, static_cast<X64Reg>(op)); +} + +void XEmitter::FLD(int bits, const OpArg& src) +{ + WriteFloatLoadStore(bits, FloatOp::LD, FloatOp::LD80, src); +} +void XEmitter::FST(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::ST, FloatOp::Invalid, dest); +} +void XEmitter::FSTP(int bits, const OpArg& dest) +{ + WriteFloatLoadStore(bits, FloatOp::STP, FloatOp::STP80, dest); +} +void XEmitter::FNSTSW_AX() +{ + Write8(0xDF); + Write8(0xE0); +} + +void XEmitter::RDTSC() +{ + Write8(0x0F); + Write8(0x31); +} +} diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h new file mode 100644 index 0000000..869acb6 --- /dev/null +++ b/src/dolphin/x64Emitter.h @@ -0,0 +1,1169 @@ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. + +// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!! 
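The emitter implementation above is one half of the pair; the header that follows declares the public XEmitter interface. As a rough illustration of how the pieces fit together, here is a minimal, hypothetical sketch of driving the emitter to generate and run a trivial function. It assumes a POSIX mmap for the executable buffer and uses only calls declared in the header below (the XEmitter(u8*) constructor, GetCodePtr, MOV, R, Imm32, RET); melonDS's actual JIT allocates and protects its code cache differently, and W^X platforms need extra handling.

#include <sys/mman.h>
#include <cstdio>
#include "x64Emitter.h"

int main()
{
    // Executable scratch buffer (sketch only; error handling and W^X omitted).
    u8* buf = (u8*)mmap(nullptr, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    Gen::XEmitter emit(buf);
    const u8* fn = emit.GetCodePtr();

    emit.MOV(32, Gen::R(Gen::EAX), Gen::Imm32(42));  // return value in EAX
    emit.RET();

    auto f = (int (*)())fn;
    printf("%d\n", f());  // prints 42
    return 0;
}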
+ +#pragma once + +#include <cstddef> +#include <cstring> +#include <functional> +#include <tuple> +#include <type_traits> + +#include "Compat.h" +#include "BitSet.h" +#include "../types.h" +#include "x64ABI.h" + +namespace Gen +{ +enum CCFlags +{ + CC_O = 0, + CC_NO = 1, + CC_B = 2, + CC_C = 2, + CC_NAE = 2, + CC_NB = 3, + CC_NC = 3, + CC_AE = 3, + CC_Z = 4, + CC_E = 4, + CC_NZ = 5, + CC_NE = 5, + CC_BE = 6, + CC_NA = 6, + CC_NBE = 7, + CC_A = 7, + CC_S = 8, + CC_NS = 9, + CC_P = 0xA, + CC_PE = 0xA, + CC_NP = 0xB, + CC_PO = 0xB, + CC_L = 0xC, + CC_NGE = 0xC, + CC_NL = 0xD, + CC_GE = 0xD, + CC_LE = 0xE, + CC_NG = 0xE, + CC_NLE = 0xF, + CC_G = 0xF +}; + +enum +{ + NUMGPRs = 16, + NUMXMMs = 16, +}; + +enum +{ + SCALE_NONE = 0, + SCALE_1 = 1, + SCALE_2 = 2, + SCALE_4 = 4, + SCALE_8 = 8, + SCALE_ATREG = 16, + // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG + SCALE_NOBASE_2 = 34, + SCALE_NOBASE_4 = 36, + SCALE_NOBASE_8 = 40, + SCALE_RIP = 0xFF, + SCALE_IMM8 = 0xF0, + SCALE_IMM16 = 0xF1, + SCALE_IMM32 = 0xF2, + SCALE_IMM64 = 0xF3, +}; + +enum SSECompare +{ + CMP_EQ = 0, + CMP_LT = 1, + CMP_LE = 2, + CMP_UNORD = 3, + CMP_NEQ = 4, + CMP_NLT = 5, + CMP_NLE = 6, + CMP_ORD = 7, +}; + +class XEmitter; +enum class FloatOp; +enum class NormalOp; + +// Information about a generated MOV op +struct MovInfo final +{ + u8* address; + bool nonAtomicSwapStore; + // valid iff nonAtomicSwapStore is true + X64Reg nonAtomicSwapStoreSrc; +}; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ + // For accessing offset and operandReg. + // This also allows us to keep the op writing functions private. + friend class XEmitter; + + // dummy op arg, used for storage + constexpr OpArg() = default; + constexpr OpArg(u64 offset_, int scale_, X64Reg rm_reg = RAX, X64Reg scaled_reg = RAX) + : scale{static_cast<u8>(scale_)}, offsetOrBaseReg{static_cast<u16>(rm_reg)}, + indexReg{static_cast<u16>(scaled_reg)}, offset{offset_} + { + } + constexpr bool operator==(const OpArg& b) const + { + // TODO: Use std::tie here once Dolphin requires C++17. (We can't do it immediately, + // (because we still support some older versions of GCC where std::tie is not constexpr.) 
+ return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && + indexReg == b.indexReg && offset == b.offset; + } + constexpr bool operator!=(const OpArg& b) const { return !operator==(b); } + u64 Imm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (u64)offset; + } + u32 Imm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (u32)offset; + } + u16 Imm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (u16)offset; + } + u8 Imm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (u8)offset; + } + + s64 SImm64() const + { + DEBUG_ASSERT(scale == SCALE_IMM64); + return (s64)offset; + } + s32 SImm32() const + { + DEBUG_ASSERT(scale == SCALE_IMM32); + return (s32)offset; + } + s16 SImm16() const + { + DEBUG_ASSERT(scale == SCALE_IMM16); + return (s16)offset; + } + s8 SImm8() const + { + DEBUG_ASSERT(scale == SCALE_IMM8); + return (s8)offset; + } + + OpArg AsImm64() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u64)offset, SCALE_IMM64); + } + OpArg AsImm32() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u32)offset, SCALE_IMM32); + } + OpArg AsImm16() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u16)offset, SCALE_IMM16); + } + OpArg AsImm8() const + { + DEBUG_ASSERT(IsImm()); + return OpArg((u8)offset, SCALE_IMM8); + } + + constexpr bool IsImm() const + { + return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || + scale == SCALE_IMM64; + } + constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; } + constexpr bool IsSimpleReg(X64Reg reg) const { return IsSimpleReg() && GetSimpleReg() == reg; } + constexpr bool IsZero() const { return IsImm() && offset == 0; } + constexpr int GetImmBits() const + { + switch (scale) + { + case SCALE_IMM8: + return 8; + case SCALE_IMM16: + return 16; + case SCALE_IMM32: + return 32; + case SCALE_IMM64: + return 64; + default: + return -1; + } + } + + constexpr X64Reg GetSimpleReg() const + { + if (scale == SCALE_NONE) + return static_cast<X64Reg>(offsetOrBaseReg); + + return INVALID_REG; + } + + void AddMemOffset(int val) + { + DEBUG_ASSERT_MSG(DYNA_REC, scale == SCALE_RIP || (scale <= SCALE_ATREG && scale > SCALE_NONE), + "Tried to increment an OpArg which doesn't have an offset"); + offset += val; + } + +private: + void WriteREX(XEmitter* emit, int opBits, int bits, int customOp = -1) const; + void WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, + int W = 0) const; + void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG, + bool warn_64bit_offset = true) const; + void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits); + void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const; + + u8 scale = 0; + u16 offsetOrBaseReg = 0; + u16 indexReg = 0; + u64 offset = 0; // Also used to store immediates. 
+ u16 operandReg = 0; +}; + +template <typename T> +inline OpArg M(const T* ptr) +{ + return OpArg((u64)(const void*)ptr, (int)SCALE_RIP); +} +constexpr OpArg R(X64Reg value) +{ + return OpArg(0, SCALE_NONE, value); +} +constexpr OpArg MatR(X64Reg value) +{ + return OpArg(0, SCALE_ATREG, value); +} + +constexpr OpArg MDisp(X64Reg value, int offset) +{ + return OpArg(static_cast<u32>(offset), SCALE_ATREG, value); +} + +constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ + return OpArg(offset, scale, base, scaled); +} + +constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) +{ + if (scale == SCALE_1) + return OpArg(offset, SCALE_ATREG, scaled); + + return OpArg(offset, scale | 0x20, RAX, scaled); +} + +constexpr OpArg MRegSum(X64Reg base, X64Reg offset) +{ + return MComplex(base, offset, 1, 0); +} + +constexpr OpArg Imm8(u8 imm) +{ + return OpArg(imm, SCALE_IMM8); +} +constexpr OpArg Imm16(u16 imm) +{ + return OpArg(imm, SCALE_IMM16); +} // rarely used +constexpr OpArg Imm32(u32 imm) +{ + return OpArg(imm, SCALE_IMM32); +} +constexpr OpArg Imm64(u64 imm) +{ + return OpArg(imm, SCALE_IMM64); +} +inline OpArg ImmPtr(const void* imm) +{ + return Imm64(reinterpret_cast<u64>(imm)); +} + +inline u32 PtrOffset(const void* ptr, const void* base = nullptr) +{ + s64 distance = (s64)ptr - (s64)base; + if (distance >= 0x80000000LL || distance < -0x80000000LL) + { + ASSERT_MSG(DYNA_REC, 0, "pointer offset out of range"); + return 0; + } + + return (u32)distance; +} + +// usage: int a[]; ARRAY_OFFSET(a,10) +#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0])) +// usage: struct {int e;} s; STRUCT_OFFSET(s,e) +#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str))) + +struct FixupBranch +{ + enum class Type + { + Branch8Bit, + Branch32Bit + }; + + u8* ptr; + Type type; +}; + +class XEmitter +{ + friend struct OpArg; // for Write8 etc +private: + u8* code = nullptr; + bool flags_locked = false; + + void CheckFlags(); + + void Rex(int w, int r, int x, int b); + void WriteModRM(int mod, int reg, int rm); + void WriteSIB(int scale, int index, int base); + void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); + void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); + void WriteMulDivType(int bits, OpArg src, int ext); + void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); + void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext); + void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext); + void WriteMXCSR(OpArg arg, int ext); + void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); + void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); + void WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0, + int extrabytes = 0); + void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + X64Reg regOp3, int W = 0); + void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0); + void 
WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, + int extrabytes = 0); + void WriteMOVBE(int bits, u8 op, X64Reg regOp, const OpArg& arg); + void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); + void WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2); + + void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, + size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + +protected: + void Write8(u8 value); + void Write16(u16 value); + void Write32(u32 value); + void Write64(u64 value); + +public: + XEmitter() = default; + explicit XEmitter(u8* code_ptr) : code{code_ptr} {} + virtual ~XEmitter() = default; + void SetCodePtr(u8* ptr); + void ReserveCodeSpace(int bytes); + u8* AlignCodeTo(size_t alignment); + u8* AlignCode4(); + u8* AlignCode16(); + u8* AlignCodePage(); + const u8* GetCodePtr() const; + u8* GetWritableCodePtr(); + + void LockFlags() { flags_locked = true; } + void UnlockFlags() { flags_locked = false; } + // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU + // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other + // string instr., + // INC and DEC are slow on Intel Core, but not on AMD. They create a + // false flag dependency because they only update a subset of the flags. + // XCHG is SLOW and should be avoided. + + // Debug breakpoint + void INT3(); + + // Do nothing + void NOP(size_t count = 1); + + // Save energy in wait-loops on P4 only. Probably not too useful. + void PAUSE(); + + // Flag control + void STC(); + void CLC(); + void CMC(); + + // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and + // AMD! + void LAHF(); // 3 cycle vector path + void SAHF(); // direct path fast + + // Stack control + void PUSH(X64Reg reg); + void POP(X64Reg reg); + void PUSH(int bits, const OpArg& reg); + void POP(int bits, const OpArg& reg); + void PUSHF(); + void POPF(); + + // Flow control + void RET(); + void RET_FAST(); + void UD2(); + FixupBranch J(bool force5bytes = false); + + void JMP(const u8* addr, bool force5Bytes = false); + void JMPptr(const OpArg& arg); + void JMPself(); // infinite loop! +#ifdef CALL +#undef CALL +#endif + void CALL(const void* fnptr); + FixupBranch CALL(); + void CALLptr(OpArg arg); + + FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); + void J_CC(CCFlags conditionCode, const u8* addr); + + void SetJumpTarget(const FixupBranch& branch); + + void SETcc(CCFlags flag, OpArg dest); + // Note: CMOV brings small if any benefit on current CPUs. 
+ void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + + // Fences + void LFENCE(); + void MFENCE(); + void SFENCE(); + + // Bit scan + void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit + void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit + + // Cache control + enum PrefetchLevel + { + PF_NTA, // Non-temporal (data used once and only once) + PF_T0, // All cache levels + PF_T1, // Levels 2+ (aliased to T0 on AMD) + PF_T2, // Levels 3+ (aliased to T0 on AMD) + }; + void PREFETCH(PrefetchLevel level, OpArg arg); + void MOVNTI(int bits, const OpArg& dest, X64Reg src); + void MOVNTDQ(const OpArg& arg, X64Reg regOp); + void MOVNTPS(const OpArg& arg, X64Reg regOp); + void MOVNTPD(const OpArg& arg, X64Reg regOp); + + // Multiplication / division + void MUL(int bits, const OpArg& src); // UNSIGNED + void IMUL(int bits, const OpArg& src); // SIGNED + void IMUL(int bits, X64Reg regOp, const OpArg& src); + void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm); + void DIV(int bits, const OpArg& src); + void IDIV(int bits, const OpArg& src); + + // Shift + void ROL(int bits, const OpArg& dest, const OpArg& shift); + void ROR_(int bits, const OpArg& dest, const OpArg& shift); + void RCL(int bits, const OpArg& dest, const OpArg& shift); + void RCR(int bits, const OpArg& dest, const OpArg& shift); + void SHL(int bits, const OpArg& dest, const OpArg& shift); + void SHR(int bits, const OpArg& dest, const OpArg& shift); + void SAR(int bits, const OpArg& dest, const OpArg& shift); + + // Bit Test + void BT(int bits, const OpArg& dest, const OpArg& index); + void BTS(int bits, const OpArg& dest, const OpArg& index); + void BTR(int bits, const OpArg& dest, const OpArg& index); + void BTC(int bits, const OpArg& dest, const OpArg& index); + + // Double-Precision Shift + void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift); + + // Extend EAX into EDX in various ways + void CWD(int bits = 16); + inline void CDQ() { CWD(32); } + inline void CQO() { CWD(64); } + void CBW(int bits = 8); + inline void CWDE() { CBW(16); } + inline void CDQE() { CBW(32); } + // Load effective address + void LEA(int bits, X64Reg dest, OpArg src); + + // Integer arithmetic + void NEG(int bits, const OpArg& src); + void ADD(int bits, const OpArg& a1, const OpArg& a2); + void ADC(int bits, const OpArg& a1, const OpArg& a2); + void SUB(int bits, const OpArg& a1, const OpArg& a2); + void SBB(int bits, const OpArg& a1, const OpArg& a2); + void AND(int bits, const OpArg& a1, const OpArg& a2); + void CMP(int bits, const OpArg& a1, const OpArg& a2); + + // Bit operations + void NOT(int bits, const OpArg& src); + void OR(int bits, const OpArg& a1, const OpArg& a2); + void XOR(int bits, const OpArg& a1, const OpArg& a2); + void MOV(int bits, const OpArg& a1, const OpArg& a2); + void TEST(int bits, const OpArg& a1, const OpArg& a2); + + void CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2); + void MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2); + + // Are these useful at all? Consider removing. + void XCHG(int bits, const OpArg& a1, const OpArg& a2); + void XCHG_AHAL(); + + // Byte swapping (32 and 64-bit only). 
+ void BSWAP(int bits, X64Reg reg); + + // Sign/zero extension + void MOVSX(int dbits, int sbits, X64Reg dest, + OpArg src); // automatically uses MOVSXD if necessary + void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); + + // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE. + void MOVBE(int bits, X64Reg dest, const OpArg& src); + void MOVBE(int bits, const OpArg& dest, X64Reg src); + void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false, + MovInfo* info = nullptr); + void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr); + + // Available only on AMD >= Phenom or Intel >= Haswell + void LZCNT(int bits, X64Reg dest, const OpArg& src); + // Note: this one is actually part of BMI1 + void TZCNT(int bits, X64Reg dest, const OpArg& src); + + // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) + void STMXCSR(const OpArg& memloc); + void LDMXCSR(const OpArg& memloc); + + // Prefixes + void LOCK(); + void REP(); + void REPNE(); + void FSOverride(); + void GSOverride(); + + // x87 + enum x87StatusWordBits + { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + + void FLD(int bits, const OpArg& src); + void FST(int bits, const OpArg& dest); + void FSTP(int bits, const OpArg& dest); + void FNSTSW_AX(); + void FWAIT(); + + // SSE/SSE2: Floating point arithmetic + void ADDSS(X64Reg regOp, const OpArg& arg); + void ADDSD(X64Reg regOp, const OpArg& arg); + void SUBSS(X64Reg regOp, const OpArg& arg); + void SUBSD(X64Reg regOp, const OpArg& arg); + void MULSS(X64Reg regOp, const OpArg& arg); + void MULSD(X64Reg regOp, const OpArg& arg); + void DIVSS(X64Reg regOp, const OpArg& arg); + void DIVSD(X64Reg regOp, const OpArg& arg); + void MINSS(X64Reg regOp, const OpArg& arg); + void MINSD(X64Reg regOp, const OpArg& arg); + void MAXSS(X64Reg regOp, const OpArg& arg); + void MAXSD(X64Reg regOp, const OpArg& arg); + void SQRTSS(X64Reg regOp, const OpArg& arg); + void SQRTSD(X64Reg regOp, const OpArg& arg); + void RCPSS(X64Reg regOp, const OpArg& arg); + void RSQRTSS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point bitwise (yes) + void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare); + + // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) + void ADDPS(X64Reg regOp, const OpArg& arg); + void ADDPD(X64Reg regOp, const OpArg& arg); + void SUBPS(X64Reg regOp, const OpArg& arg); + void SUBPD(X64Reg regOp, const OpArg& arg); + void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare); + void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare); + void MULPS(X64Reg regOp, const OpArg& arg); + void MULPD(X64Reg regOp, const OpArg& arg); + void DIVPS(X64Reg regOp, const OpArg& arg); + void DIVPD(X64Reg regOp, const OpArg& arg); + void MINPS(X64Reg regOp, const OpArg& arg); + void MINPD(X64Reg regOp, const OpArg& arg); + void MAXPS(X64Reg regOp, const OpArg& arg); + void MAXPD(X64Reg regOp, const OpArg& arg); + void SQRTPS(X64Reg regOp, const OpArg& arg); + void SQRTPD(X64Reg regOp, const OpArg& arg); + void RCPPS(X64Reg regOp, const OpArg& arg); + void RSQRTPS(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Floating point 
packed bitwise (x4 for float, x2 for double) + void ANDPS(X64Reg regOp, const OpArg& arg); + void ANDPD(X64Reg regOp, const OpArg& arg); + void ANDNPS(X64Reg regOp, const OpArg& arg); + void ANDNPD(X64Reg regOp, const OpArg& arg); + void ORPS(X64Reg regOp, const OpArg& arg); + void ORPD(X64Reg regOp, const OpArg& arg); + void XORPS(X64Reg regOp, const OpArg& arg); + void XORPD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. + void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle); + void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle); + + // SSE3 + void MOVSLDUP(X64Reg regOp, const OpArg& arg); + void MOVSHDUP(X64Reg regOp, const OpArg& arg); + void MOVDDUP(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Useful alternative to shuffle in some cases. + void UNPCKLPS(X64Reg dest, const OpArg& src); + void UNPCKHPS(X64Reg dest, const OpArg& src); + void UNPCKLPD(X64Reg dest, const OpArg& src); + void UNPCKHPD(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Compares. + void COMISS(X64Reg regOp, const OpArg& arg); + void COMISD(X64Reg regOp, const OpArg& arg); + void UCOMISS(X64Reg regOp, const OpArg& arg); + void UCOMISD(X64Reg regOp, const OpArg& arg); + + // SSE/SSE2: Moves. Use the right data type for your data, in most cases. + void MOVAPS(X64Reg regOp, const OpArg& arg); + void MOVAPD(X64Reg regOp, const OpArg& arg); + void MOVAPS(const OpArg& arg, X64Reg regOp); + void MOVAPD(const OpArg& arg, X64Reg regOp); + + void MOVUPS(X64Reg regOp, const OpArg& arg); + void MOVUPD(X64Reg regOp, const OpArg& arg); + void MOVUPS(const OpArg& arg, X64Reg regOp); + void MOVUPD(const OpArg& arg, X64Reg regOp); + + void MOVDQA(X64Reg regOp, const OpArg& arg); + void MOVDQA(const OpArg& arg, X64Reg regOp); + void MOVDQU(X64Reg regOp, const OpArg& arg); + void MOVDQU(const OpArg& arg, X64Reg regOp); + + void MOVSS(X64Reg regOp, const OpArg& arg); + void MOVSD(X64Reg regOp, const OpArg& arg); + void MOVSS(const OpArg& arg, X64Reg regOp); + void MOVSD(const OpArg& arg, X64Reg regOp); + + void MOVLPS(X64Reg regOp, const OpArg& arg); + void MOVLPD(X64Reg regOp, const OpArg& arg); + void MOVLPS(const OpArg& arg, X64Reg regOp); + void MOVLPD(const OpArg& arg, X64Reg regOp); + + void MOVHPS(X64Reg regOp, const OpArg& arg); + void MOVHPD(X64Reg regOp, const OpArg& arg); + void MOVHPS(const OpArg& arg, X64Reg regOp); + void MOVHPD(const OpArg& arg, X64Reg regOp); + + void MOVHLPS(X64Reg regOp1, X64Reg regOp2); + void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + + // Be careful when using these overloads for reg <--> xmm moves. + // The one you cast to OpArg with R(reg) is the x86 reg, the other + // one is the xmm reg. + // ie: "MOVD_xmm(eax, R(xmm1))" generates incorrect code (movd xmm0, rcx) + // use "MOVD_xmm(R(eax), xmm1)" instead. + void MOVD_xmm(X64Reg dest, const OpArg& arg); + void MOVQ_xmm(X64Reg dest, OpArg arg); + void MOVD_xmm(const OpArg& arg, X64Reg src); + void MOVQ_xmm(OpArg arg, X64Reg src); + + // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in + // question. + void MOVMSKPS(X64Reg dest, const OpArg& arg); + void MOVMSKPD(X64Reg dest, const OpArg& arg); + + // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a + // weird one. + void MASKMOVDQU(X64Reg dest, X64Reg src); + void LDDQU(X64Reg dest, const OpArg& src); + + // SSE/SSE2: Data type conversions. 
+ void CVTPS2PD(X64Reg dest, const OpArg& src); + void CVTPD2PS(X64Reg dest, const OpArg& src); + void CVTSS2SD(X64Reg dest, const OpArg& src); + void CVTSI2SS(X64Reg dest, const OpArg& src); + void CVTSD2SS(X64Reg dest, const OpArg& src); + void CVTSI2SD(X64Reg dest, const OpArg& src); + void CVTDQ2PD(X64Reg regOp, const OpArg& arg); + void CVTPD2DQ(X64Reg regOp, const OpArg& arg); + void CVTDQ2PS(X64Reg regOp, const OpArg& arg); + void CVTPS2DQ(X64Reg regOp, const OpArg& arg); + + void CVTTPS2DQ(X64Reg regOp, const OpArg& arg); + void CVTTPD2DQ(X64Reg regOp, const OpArg& arg); + + // Destinations are X64 regs (rax, rbx, ...) for these instructions. + void CVTSS2SI(X64Reg xregdest, const OpArg& src); + void CVTSD2SI(X64Reg xregdest, const OpArg& src); + void CVTTSS2SI(X64Reg xregdest, const OpArg& arg); + void CVTTSD2SI(X64Reg xregdest, const OpArg& arg); + + // SSE2: Packed integer instructions + void PACKSSDW(X64Reg dest, const OpArg& arg); + void PACKSSWB(X64Reg dest, const OpArg& arg); + void PACKUSDW(X64Reg dest, const OpArg& arg); + void PACKUSWB(X64Reg dest, const OpArg& arg); + + void PUNPCKLBW(X64Reg dest, const OpArg& arg); + void PUNPCKLWD(X64Reg dest, const OpArg& arg); + void PUNPCKLDQ(X64Reg dest, const OpArg& arg); + void PUNPCKLQDQ(X64Reg dest, const OpArg& arg); + + void PTEST(X64Reg dest, const OpArg& arg); + void PAND(X64Reg dest, const OpArg& arg); + void PANDN(X64Reg dest, const OpArg& arg); + void PXOR(X64Reg dest, const OpArg& arg); + void POR(X64Reg dest, const OpArg& arg); + + void PADDB(X64Reg dest, const OpArg& arg); + void PADDW(X64Reg dest, const OpArg& arg); + void PADDD(X64Reg dest, const OpArg& arg); + void PADDQ(X64Reg dest, const OpArg& arg); + + void PADDSB(X64Reg dest, const OpArg& arg); + void PADDSW(X64Reg dest, const OpArg& arg); + void PADDUSB(X64Reg dest, const OpArg& arg); + void PADDUSW(X64Reg dest, const OpArg& arg); + + void PSUBB(X64Reg dest, const OpArg& arg); + void PSUBW(X64Reg dest, const OpArg& arg); + void PSUBD(X64Reg dest, const OpArg& arg); + void PSUBQ(X64Reg dest, const OpArg& arg); + + void PSUBSB(X64Reg dest, const OpArg& arg); + void PSUBSW(X64Reg dest, const OpArg& arg); + void PSUBUSB(X64Reg dest, const OpArg& arg); + void PSUBUSW(X64Reg dest, const OpArg& arg); + + void PAVGB(X64Reg dest, const OpArg& arg); + void PAVGW(X64Reg dest, const OpArg& arg); + + void PCMPEQB(X64Reg dest, const OpArg& arg); + void PCMPEQW(X64Reg dest, const OpArg& arg); + void PCMPEQD(X64Reg dest, const OpArg& arg); + + void PCMPGTB(X64Reg dest, const OpArg& arg); + void PCMPGTW(X64Reg dest, const OpArg& arg); + void PCMPGTD(X64Reg dest, const OpArg& arg); + + void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg); + + void PMADDWD(X64Reg dest, const OpArg& arg); + void PSADBW(X64Reg dest, const OpArg& arg); + + void PMAXSW(X64Reg dest, const OpArg& arg); + void PMAXUB(X64Reg dest, const OpArg& arg); + void PMINSW(X64Reg dest, const OpArg& arg); + void PMINUB(X64Reg dest, const OpArg& arg); + + void PMOVMSKB(X64Reg dest, const OpArg& arg); + void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFB(X64Reg dest, const OpArg& arg); + + void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); + void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); + + void PSRLW(X64Reg reg, int shift); + void PSRLD(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, int shift); + void PSRLQ(X64Reg reg, const OpArg& arg); + void PSRLDQ(X64Reg reg, 
int shift); + + void PSLLW(X64Reg reg, int shift); + void PSLLD(X64Reg reg, int shift); + void PSLLQ(X64Reg reg, int shift); + void PSLLDQ(X64Reg reg, int shift); + + void PSRAW(X64Reg reg, int shift); + void PSRAD(X64Reg reg, int shift); + + // SSE4: data type conversions + void PMOVSXBW(X64Reg dest, const OpArg& arg); + void PMOVSXBD(X64Reg dest, const OpArg& arg); + void PMOVSXBQ(X64Reg dest, const OpArg& arg); + void PMOVSXWD(X64Reg dest, const OpArg& arg); + void PMOVSXWQ(X64Reg dest, const OpArg& arg); + void PMOVSXDQ(X64Reg dest, const OpArg& arg); + void PMOVZXBW(X64Reg dest, const OpArg& arg); + void PMOVZXBD(X64Reg dest, const OpArg& arg); + void PMOVZXBQ(X64Reg dest, const OpArg& arg); + void PMOVZXWD(X64Reg dest, const OpArg& arg); + void PMOVZXWQ(X64Reg dest, const OpArg& arg); + void PMOVZXDQ(X64Reg dest, const OpArg& arg); + + // SSE4: blend instructions + void PBLENDVB(X64Reg dest, const OpArg& arg); + void BLENDVPS(X64Reg dest, const OpArg& arg); + void BLENDVPD(X64Reg dest, const OpArg& arg); + void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); + void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + + // AVX + void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare); + void VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle); + void VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg mask); + void VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + void VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend); + + void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPOR(X64Reg regOp1, X64Reg regOp2, 
const OpArg& arg); + void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + // FMA3 + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PS(X64Reg 
regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + +#define FMA4(name) \ + void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \ + void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + + FMA4(VFMADDSUBPS) + FMA4(VFMADDSUBPD) + FMA4(VFMSUBADDPS) + FMA4(VFMSUBADDPD) + FMA4(VFMADDPS) + FMA4(VFMADDPD) + FMA4(VFMADDSS) + FMA4(VFMADDSD) + FMA4(VFMSUBPS) + FMA4(VFMSUBPD) + FMA4(VFMSUBSS) + FMA4(VFMSUBSD) + FMA4(VFNMADDPS) + FMA4(VFNMADDPD) + FMA4(VFNMADDSS) + FMA4(VFNMADDSD) + FMA4(VFNMSUBPS) + FMA4(VFNMSUBPD) + FMA4(VFNMSUBSS) + FMA4(VFNMSUBSD) +#undef FMA4 + + // VEX GPR instructions + void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate); + void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void BLSR(int bits, X64Reg regOp, const OpArg& arg); + void BLSMSK(int bits, X64Reg regOp, const OpArg& arg); + void BLSI(int bits, X64Reg regOp, const OpArg& arg); + void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2); + void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + + void RDTSC(); + + // Utility functions + // The difference between this and CALL is that this aligns the stack + // where appropriate. 
+ template <typename FunctionPointer> + void ABI_CallFunction(FunctionPointer func) + { + static_assert(std::is_pointer<FunctionPointer>() && + std::is_function<std::remove_pointer_t<FunctionPointer>>(), + "Supplied type must be a function pointer."); + + const void* ptr = reinterpret_cast<const void*>(func); + const u64 address = reinterpret_cast<u64>(ptr); + const u64 distance = address - (reinterpret_cast<u64>(code) + 5); + + if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL) + { + // Far call + MOV(64, R(RAX), Imm64(address)); + CALLptr(R(RAX)); + } + else + { + CALL(ptr); + } + } + + template <typename FunctionPointer> + void ABI_CallFunctionC16(FunctionPointer func, u16 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC16(FunctionPointer func, u32 param1, u16 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionC(FunctionPointer func, u32 param1) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCC(FunctionPointer func, u32 param1, u32 param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCP(FunctionPointer func, u32 param1, const void* param2) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCC(FunctionPointer func, u32 param1, u32 param2, u32 param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCP(FunctionPointer func, u32 param1, u32 param2, const void* param3) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(64, R(ABI_PARAM3), Imm64(reinterpret_cast<u64>(param3))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionCCCP(FunctionPointer func, u32 param1, u32 param2, u32 param3, + const void* param4) + { + MOV(32, R(ABI_PARAM1), Imm32(param1)); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + MOV(64, R(ABI_PARAM4), Imm64(reinterpret_cast<u64>(param4))); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPC(FunctionPointer func, const void* param1, u32 param2) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionPPC(FunctionPointer func, const void* param1, const void* param2, u32 param3) + { + MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1))); + MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2))); + MOV(32, R(ABI_PARAM3), Imm32(param3)); + ABI_CallFunction(func); + } + + // Pass a register as a parameter. + template <typename FunctionPointer> + void ABI_CallFunctionR(FunctionPointer func, X64Reg reg1) + { + if (reg1 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(reg1)); + ABI_CallFunction(func); + } + + // Pass two registers as parameters. 
+ template <typename FunctionPointer> + void ABI_CallFunctionRR(FunctionPointer func, X64Reg reg1, X64Reg reg2) + { + MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionAC(int bits, FunctionPointer func, const Gen::OpArg& arg1, u32 param2) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + MOV(32, R(ABI_PARAM2), Imm32(param2)); + ABI_CallFunction(func); + } + + template <typename FunctionPointer> + void ABI_CallFunctionA(int bits, FunctionPointer func, const Gen::OpArg& arg1) + { + if (!arg1.IsSimpleReg(ABI_PARAM1)) + MOV(bits, R(ABI_PARAM1), arg1); + ABI_CallFunction(func); + } + + // Helper method for ABI functions related to calling functions. May be used by itself as well. + void MOVTwo(int bits, X64Reg dst1, X64Reg src1, s32 offset, X64Reg dst2, X64Reg src2); + + // Saves/restores the registers and adjusts the stack to be aligned as + // required by the ABI, where the previous alignment was as specified. + // Push returns the size of the shadow space, i.e. the offset of the frame. + size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, + size_t needed_frame_size = 0); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template <typename T, typename... Args> + static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args) + { + return (*f)(args...); + } + + template <typename T, typename... Args> + void ABI_CallLambdaC(const std::function<T(Args...)>* f, u32 p1) + { + auto trampoline = &XEmitter::CallLambdaTrampoline<T, Args...>; + ABI_CallFunctionPC(trampoline, reinterpret_cast<const void*>(f), p1); + } +}; // class XEmitter + +} // namespace diff --git a/src/dolphin/x64Reg.h b/src/dolphin/x64Reg.h new file mode 100644 index 0000000..a92e024 --- /dev/null +++ b/src/dolphin/x64Reg.h @@ -0,0 +1,96 @@ +// Copyright 2016 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license_dolphin.txt file included. 
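The ABI_CallFunction* templates above are the bridge between generated code and ordinary C functions: arguments are loaded into the ABI_PARAMn registers (defined in x64ABI.h, which is not part of this hunk), and the call itself is emitted as a rel32 CALL when the target lies within ±2 GiB of the code buffer, falling back to MOV RAX, imm64 plus CALLptr otherwise. A hedged sketch of how generated code might use one of these wrappers follows; the callee here is purely illustrative, not a melonDS function.

#include <cstdio>
#include "x64Emitter.h"

// Hypothetical C helper a generated block could call; not part of melonDS.
static void LogWrite(u32 addr, u32 val)
{
    printf("write %08X <- %08X\n", addr, val);
}

// Emits a call to LogWrite(0x02000000, 0xDEADBEEF) into an existing emitter.
// ABI_CallFunctionCC loads both immediates into the first two ABI parameter
// registers, then ABI_CallFunction picks the near or far call form.
static void EmitLoggedWrite(Gen::XEmitter& emit)
{
    emit.ABI_CallFunctionCC(&LogWrite, 0x02000000u, 0xDEADBEEFu);
}

In real JIT code such a call would normally be bracketed with ABI_PushRegistersAndAdjustStack / ABI_PopRegistersAndAdjustStack so that live caller-saved registers survive the call and the stack meets the ABI's alignment requirement.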
+ +#pragma once + +namespace Gen +{ +enum X64Reg +{ + EAX = 0, + EBX = 3, + ECX = 1, + EDX = 2, + ESI = 6, + EDI = 7, + EBP = 5, + ESP = 4, + + RAX = 0, + RBX = 3, + RCX = 1, + RDX = 2, + RSI = 6, + RDI = 7, + RBP = 5, + RSP = 4, + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, + + AL = 0, + BL = 3, + CL = 1, + DL = 2, + SIL = 6, + DIL = 7, + BPL = 5, + SPL = 4, + AH = 0x104, + BH = 0x107, + CH = 0x105, + DH = 0x106, + + AX = 0, + BX = 3, + CX = 1, + DX = 2, + SI = 6, + DI = 7, + BP = 5, + SP = 4, + + XMM0 = 0, + XMM1, + XMM2, + XMM3, + XMM4, + XMM5, + XMM6, + XMM7, + XMM8, + XMM9, + XMM10, + XMM11, + XMM12, + XMM13, + XMM14, + XMM15, + + YMM0 = 0, + YMM1, + YMM2, + YMM3, + YMM4, + YMM5, + YMM6, + YMM7, + YMM8, + YMM9, + YMM10, + YMM11, + YMM12, + YMM13, + YMM14, + YMM15, + + INVALID_REG = 0xFFFFFFFF +}; + +} // namespace Gen diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 09faf4e..9ee7b9a 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -32,6 +32,7 @@ EmuSettingsDialog* EmuSettingsDialog::currentDlg = nullptr; extern char* EmuDirectory; +extern bool RunningSomething; EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::EmuSettingsDialog) @@ -53,6 +54,22 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->cbxConsoleType->setCurrentIndex(Config::ConsoleType); ui->chkDirectBoot->setChecked(Config::DirectBoot != 0); + +#ifdef JIT_ENABLED + ui->chkEnableJIT->setChecked(Config::JIT_Enable != 0); + ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); + ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); + ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); +#else + ui->chkEnableJIT->setDisabled(true); + ui->chkJITBranchOptimisations->setDisabled(true); + ui->chkJITLiteralOptimisations->setDisabled(true); + ui->chkJITFastMemory->setDisabled(true); + ui->spnJITMaximumBlockSize->setDisabled(true); +#endif + + on_chkEnableJIT_toggled(); } EmuSettingsDialog::~EmuSettingsDialog() @@ -102,29 +119,78 @@ void EmuSettingsDialog::verifyFirmware() } } -void EmuSettingsDialog::on_EmuSettingsDialog_accepted() +void EmuSettingsDialog::done(int r) { - verifyFirmware(); - - strncpy(Config::BIOS9Path, ui->txtBIOS9Path->text().toStdString().c_str(), 1023); Config::BIOS9Path[1023] = '\0'; - strncpy(Config::BIOS7Path, ui->txtBIOS7Path->text().toStdString().c_str(), 1023); Config::BIOS7Path[1023] = '\0'; - strncpy(Config::FirmwarePath, ui->txtFirmwarePath->text().toStdString().c_str(), 1023); Config::FirmwarePath[1023] = '\0'; - - strncpy(Config::DSiBIOS9Path, ui->txtDSiBIOS9Path->text().toStdString().c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; - strncpy(Config::DSiBIOS7Path, ui->txtDSiBIOS7Path->text().toStdString().c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; - strncpy(Config::DSiFirmwarePath, ui->txtDSiFirmwarePath->text().toStdString().c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; - strncpy(Config::DSiNANDPath, ui->txtDSiNANDPath->text().toStdString().c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; - - Config::ConsoleType = ui->cbxConsoleType->currentIndex(); - Config::DirectBoot = ui->chkDirectBoot->isChecked() ? 
1:0; - - Config::Save(); + if (r == QDialog::Accepted) + { + verifyFirmware(); + + int consoleType = ui->cbxConsoleType->currentIndex(); + int directBoot = ui->chkDirectBoot->isChecked() ? 1:0; + + int jitEnable = ui->chkEnableJIT->isChecked() ? 1:0; + int jitMaxBlockSize = ui->spnJITMaximumBlockSize->value(); + int jitBranchOptimisations = ui->chkJITBranchOptimisations->isChecked() ? 1:0; + int jitLiteralOptimisations = ui->chkJITLiteralOptimisations->isChecked() ? 1:0; + int jitFastMemory = ui->chkJITFastMemory->isChecked() ? 1:0; + + std::string bios9Path = ui->txtBIOS9Path->text().toStdString(); + std::string bios7Path = ui->txtBIOS7Path->text().toStdString(); + std::string firmwarePath = ui->txtFirmwarePath->text().toStdString(); + std::string dsiBios9Path = ui->txtDSiBIOS9Path->text().toStdString(); + std::string dsiBios7Path = ui->txtDSiBIOS7Path->text().toStdString(); + std::string dsiFirmwarePath = ui->txtDSiFirmwarePath->text().toStdString(); + std::string dsiNANDPath = ui->txtDSiNANDPath->text().toStdString(); + + if (consoleType != Config::ConsoleType + || directBoot != Config::DirectBoot +#ifdef JIT_ENABLED + || jitEnable != Config::JIT_Enable + || jitMaxBlockSize != Config::JIT_MaxBlockSize + || jitBranchOptimisations != Config::JIT_BranchOptimisations + || jitLiteralOptimisations != Config::JIT_LiteralOptimisations + || jitFastMemory != Config::JIT_FastMemory +#endif + || strcmp(Config::BIOS9Path, bios9Path.c_str()) != 0 + || strcmp(Config::BIOS7Path, bios7Path.c_str()) != 0 + || strcmp(Config::FirmwarePath, firmwarePath.c_str()) != 0 + || strcmp(Config::DSiBIOS9Path, dsiBios9Path.c_str()) != 0 + || strcmp(Config::DSiBIOS7Path, dsiBios7Path.c_str()) != 0 + || strcmp(Config::DSiFirmwarePath, dsiFirmwarePath.c_str()) != 0 + || strcmp(Config::DSiNANDPath, dsiNANDPath.c_str()) != 0) + { + if (RunningSomething + && QMessageBox::warning(this, "Reset necessary to apply changes", + "The emulation will be reset for the changes to take place", + QMessageBox::Yes, QMessageBox::Cancel) != QMessageBox::Yes) + return; + + strncpy(Config::BIOS9Path, bios9Path.c_str(), 1023); Config::BIOS9Path[1023] = '\0'; + strncpy(Config::BIOS7Path, bios7Path.c_str(), 1023); Config::BIOS7Path[1023] = '\0'; + strncpy(Config::FirmwarePath, firmwarePath.c_str(), 1023); Config::FirmwarePath[1023] = '\0'; + + strncpy(Config::DSiBIOS9Path, dsiBios9Path.c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; + strncpy(Config::DSiBIOS7Path, dsiBios7Path.c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; + strncpy(Config::DSiFirmwarePath, dsiFirmwarePath.c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; + strncpy(Config::DSiNANDPath, dsiNANDPath.c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; + + #ifdef JIT_ENABLED + Config::JIT_Enable = jitEnable; + Config::JIT_MaxBlockSize = jitMaxBlockSize; + Config::JIT_BranchOptimisations = jitBranchOptimisations; + Config::JIT_LiteralOptimisations = jitLiteralOptimisations; + Config::JIT_FastMemory = jitFastMemory; + #endif + + Config::ConsoleType = consoleType; + Config::DirectBoot = directBoot; + + Config::Save(); + } + } - closeDlg(); -} + QDialog::done(r); -void EmuSettingsDialog::on_EmuSettingsDialog_rejected() -{ closeDlg(); } @@ -211,3 +277,12 @@ void EmuSettingsDialog::on_btnDSiNANDBrowse_clicked() ui->txtDSiNANDPath->setText(file); } + +void EmuSettingsDialog::on_chkEnableJIT_toggled() +{ + bool disabled = !ui->chkEnableJIT->isChecked(); + ui->chkJITBranchOptimisations->setDisabled(disabled); + ui->chkJITLiteralOptimisations->setDisabled(disabled); + 
ui->chkJITFastMemory->setDisabled(disabled); + ui->spnJITMaximumBlockSize->setDisabled(disabled); +}
\ No newline at end of file diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.h b/src/frontend/qt_sdl/EmuSettingsDialog.h index f604ba5..268036c 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.h +++ b/src/frontend/qt_sdl/EmuSettingsDialog.h @@ -51,8 +51,7 @@ public: } private slots: - void on_EmuSettingsDialog_accepted(); - void on_EmuSettingsDialog_rejected(); + void done(int r); void on_btnBIOS9Browse_clicked(); void on_btnBIOS7Browse_clicked(); @@ -63,6 +62,8 @@ private slots: void on_btnDSiFirmwareBrowse_clicked(); void on_btnDSiNANDBrowse_clicked(); + void on_chkEnableJIT_toggled(); + private: void verifyFirmware(); diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.ui b/src/frontend/qt_sdl/EmuSettingsDialog.ui index 4894fa5..11d48cc 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.ui +++ b/src/frontend/qt_sdl/EmuSettingsDialog.ui @@ -6,8 +6,8 @@ <rect> <x>0</x> <y>0</y> - <width>490</width> - <height>392</height> + <width>514</width> + <height>359</height> </rect> </property> <property name="sizePolicy"> @@ -24,243 +24,336 @@ <enum>QLayout::SetFixedSize</enum> </property> <item> - <widget class="QGroupBox" name="groupBox"> - <property name="title"> - <string>DS mode</string> + <widget class="QTabWidget" name="tabWidget"> + <property name="currentIndex"> + <number>0</number> </property> - <layout class="QGridLayout" name="gridLayout_2"> - <item row="0" column="1"> - <widget class="QLineEdit" name="txtBIOS9Path"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="minimumSize"> - <size> - <width>290</width> - <height>0</height> - </size> - </property> - <property name="statusTip"> - <string/> - </property> - <property name="whatsThis"> - <string><html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="2" column="0"> - <widget class="QLabel" name="label_3"> - <property name="text"> - <string>DS firmware:</string> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QLabel" name="label_2"> - <property name="text"> - <string>DS ARM7 BIOS:</string> - </property> - </widget> - </item> - <item row="0" column="0"> - <widget class="QLabel" name="label"> - <property name="text"> - <string>DS ARM9 BIOS:</string> - </property> - </widget> - </item> - <item row="0" column="2"> - <widget class="QPushButton" name="btnBIOS9Browse"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Browse...</string> - </property> - <property name="autoDefault"> - <bool>true</bool> - </property> - </widget> - </item> - <item row="1" column="1"> - <widget class="QLineEdit" name="txtBIOS7Path"> - <property name="whatsThis"> - <string><html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="1" column="2"> - <widget class="QPushButton" name="btnBIOS7Browse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="2" column="1"> - <widget class="QLineEdit" name="txtFirmwarePath"> - <property name="whatsThis"> - <string><html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 KB: DS-mode firmware from a DSi or 3DS. 
Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html></string> - </property> - </widget> - </item> - <item row="2" column="2"> - <widget class="QPushButton" name="btnFirmwareBrowse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QGroupBox" name="groupBox_3"> - <property name="title"> - <string>DSi mode</string> - </property> - <layout class="QGridLayout" name="gridLayout_3"> - <item row="0" column="2"> - <widget class="QPushButton" name="btnDSiBIOS9Browse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="0" column="0"> - <widget class="QLabel" name="label_5"> - <property name="text"> - <string>DSi ARM9 BIOS:</string> - </property> - </widget> - </item> - <item row="2" column="2"> - <widget class="QPushButton" name="btnDSiFirmwareBrowse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="1" column="1"> - <widget class="QLineEdit" name="txtDSiBIOS7Path"> - <property name="whatsThis"> - <string><html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="2" column="1"> - <widget class="QLineEdit" name="txtDSiFirmwarePath"> - <property name="whatsThis"> - <string><html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QLabel" name="label_6"> - <property name="text"> - <string>DSi ARM7 BIOS:</string> - </property> - </widget> - </item> - <item row="2" column="0"> - <widget class="QLabel" name="label_7"> - <property name="text"> - <string>DSi firmware:</string> - </property> - </widget> - </item> - <item row="1" column="2"> - <widget class="QPushButton" name="btnDSiBIOS7Browse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QLineEdit" name="txtDSiBIOS9Path"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="whatsThis"> - <string><html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> - </property> - </widget> - </item> - <item row="3" column="0"> - <widget class="QLabel" name="label_8"> - <property name="text"> - <string>DSi NAND:</string> - </property> - </widget> - </item> - <item row="3" column="1"> - <widget class="QLineEdit" name="txtDSiNANDPath"> - <property name="whatsThis"> - <string><html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html></string> - </property> - </widget> - </item> - <item row="3" column="2"> - <widget class="QPushButton" name="btnDSiNANDBrowse"> - <property name="text"> - <string>Browse...</string> - </property> - </widget> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QGroupBox" name="groupBox_2"> - <property name="title"> - <string>General</string> - </property> - <layout class="QGridLayout" name="gridLayout"> - <item row="0" column="0"> - <widget class="QLabel" name="label_4"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Fixed"> - <horstretch>0</horstretch> - 
<verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Console type:</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QComboBox" name="cbxConsoleType"> - <property name="sizePolicy"> - <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="whatsThis"> - <string><html><head/><body><p>The type of console to emulate</p></body></html></string> - </property> - </widget> - </item> - <item row="1" column="0" colspan="2"> - <widget class="QCheckBox" name="chkDirectBoot"> - <property name="whatsThis"> - <string><html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html></string> - </property> - <property name="text"> - <string>Boot game directly</string> - </property> - </widget> - </item> - </layout> + <widget class="QWidget" name="tab"> + <attribute name="title"> + <string>General</string> + </attribute> + <layout class="QFormLayout" name="formLayout_4"> + <item row="1" column="1"> + <widget class="QComboBox" name="cbxConsoleType"> + <property name="sizePolicy"> + <sizepolicy hsizetype="MinimumExpanding" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="whatsThis"> + <string><html><head/><body><p>The type of console to emulate</p></body></html></string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QCheckBox" name="chkDirectBoot"> + <property name="whatsThis"> + <string><html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html></string> + </property> + <property name="text"> + <string>Boot game directly</string> + </property> + </widget> + </item> + <item row="3" column="0"> + <spacer name="verticalSpacer_2"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>20</width> + <height>40</height> + </size> + </property> + </spacer> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_4"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="text"> + <string>Console type:</string> + </property> + </widget> + </item> + </layout> + </widget> + <widget class="QWidget" name="tab_2"> + <attribute name="title"> + <string>BIOS Files</string> + </attribute> + <layout class="QVBoxLayout" name="verticalLayout_2"> + <item> + <widget class="QGroupBox" name="groupBox"> + <property name="title"> + <string>DS mode</string> + </property> + <layout class="QGridLayout" name="gridLayout_2"> + <item row="2" column="0"> + <widget class="QLabel" name="label_3"> + <property name="text"> + <string>DS firmware:</string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QLineEdit" name="txtFirmwarePath"> + <property name="whatsThis"> + <string><html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 
KB: DS-mode firmware from a DSi or 3DS. Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html></string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QLineEdit" name="txtBIOS7Path"> + <property name="whatsThis"> + <string><html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="0" column="2"> + <widget class="QPushButton" name="btnBIOS9Browse"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="text"> + <string>Browse...</string> + </property> + <property name="autoDefault"> + <bool>true</bool> + </property> + </widget> + </item> + <item row="2" column="2"> + <widget class="QPushButton" name="btnFirmwareBrowse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_2"> + <property name="text"> + <string>DS ARM7 BIOS:</string> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="label"> + <property name="text"> + <string>DS ARM9 BIOS:</string> + </property> + </widget> + </item> + <item row="1" column="2"> + <widget class="QPushButton" name="btnBIOS7Browse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="0" column="1"> + <widget class="QLineEdit" name="txtBIOS9Path"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="minimumSize"> + <size> + <width>290</width> + <height>0</height> + </size> + </property> + <property name="statusTip"> + <string/> + </property> + <property name="whatsThis"> + <string><html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html></string> + </property> + </widget> + </item> + </layout> + </widget> + </item> + <item> + <widget class="QGroupBox" name="groupBox_3"> + <property name="title"> + <string>DSi mode</string> + </property> + <layout class="QGridLayout" name="gridLayout_3"> + <item row="0" column="2"> + <widget class="QPushButton" name="btnDSiBIOS9Browse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="label_5"> + <property name="text"> + <string>DSi ARM9 BIOS:</string> + </property> + </widget> + </item> + <item row="2" column="2"> + <widget class="QPushButton" name="btnDSiFirmwareBrowse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QLineEdit" name="txtDSiBIOS7Path"> + <property name="whatsThis"> + <string><html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QLineEdit" name="txtDSiFirmwarePath"> + <property name="whatsThis"> + <string><html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_6"> + <property name="text"> + <string>DSi ARM7 BIOS:</string> + </property> + </widget> + 
</item> + <item row="2" column="0"> + <widget class="QLabel" name="label_7"> + <property name="text"> + <string>DSi firmware:</string> + </property> + </widget> + </item> + <item row="1" column="2"> + <widget class="QPushButton" name="btnDSiBIOS7Browse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + <item row="0" column="1"> + <widget class="QLineEdit" name="txtDSiBIOS9Path"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Expanding" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="whatsThis"> + <string><html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html></string> + </property> + </widget> + </item> + <item row="3" column="0"> + <widget class="QLabel" name="label_8"> + <property name="text"> + <string>DSi NAND:</string> + </property> + </widget> + </item> + <item row="3" column="1"> + <widget class="QLineEdit" name="txtDSiNANDPath"> + <property name="whatsThis"> + <string><html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html></string> + </property> + </widget> + </item> + <item row="3" column="2"> + <widget class="QPushButton" name="btnDSiNANDBrowse"> + <property name="text"> + <string>Browse...</string> + </property> + </widget> + </item> + </layout> + </widget> + </item> + </layout> + </widget> + <widget class="QWidget" name="tab_3"> + <attribute name="title"> + <string>CPU Emulation</string> + </attribute> + <layout class="QFormLayout" name="formLayout_5"> + <item row="0" column="0"> + <widget class="QCheckBox" name="chkEnableJIT"> + <property name="text"> + <string>Enable JIT recompiler</string> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_9"> + <property name="text"> + <string>Maximum JIT block size:</string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QSpinBox" name="spnJITMaximumBlockSize"> + <property name="minimum"> + <number>1</number> + </property> + <property name="maximum"> + <number>32</number> + </property> + <property name="value"> + <number>32</number> + </property> + </widget> + </item> + <item row="2" column="0"> + <widget class="QCheckBox" name="chkJITBranchOptimisations"> + <property name="text"> + <string>Branch Optimisations</string> + </property> + </widget> + </item> + <item row="3" column="0"> + <widget class="QCheckBox" name="chkJITLiteralOptimisations"> + <property name="text"> + <string>Literal Optimisations</string> + </property> + </widget> + </item> + <item row="4" column="0"> + <widget class="QCheckBox" name="chkJITFastMemory"> + <property name="text"> + <string>Fast Memory</string> + </property> + </widget> + </item> + <item row="5" column="0"> + <spacer name="verticalSpacer"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>20</width> + <height>40</height> + </size> + </property> + </spacer> + </item> + </layout> + </widget> </widget> </item> <item> @@ -275,6 +368,27 @@ </item> </layout> </widget> + <tabstops> + <tabstop>tabWidget</tabstop> + <tabstop>cbxConsoleType</tabstop> + <tabstop>chkDirectBoot</tabstop> + <tabstop>txtBIOS9Path</tabstop> + <tabstop>txtBIOS7Path</tabstop> + <tabstop>txtFirmwarePath</tabstop> + <tabstop>txtDSiBIOS9Path</tabstop> + <tabstop>txtDSiBIOS7Path</tabstop> + <tabstop>txtDSiFirmwarePath</tabstop> + 
<tabstop>txtDSiNANDPath</tabstop> + <tabstop>btnBIOS9Browse</tabstop> + <tabstop>btnBIOS7Browse</tabstop> + <tabstop>btnFirmwareBrowse</tabstop> + <tabstop>btnDSiBIOS9Browse</tabstop> + <tabstop>btnDSiBIOS7Browse</tabstop> + <tabstop>btnDSiFirmwareBrowse</tabstop> + <tabstop>btnDSiNANDBrowse</tabstop> + <tabstop>chkEnableJIT</tabstop> + <tabstop>spnJITMaximumBlockSize</tabstop> + </tabstops> <resources/> <connections> <connection> @@ -284,8 +398,8 @@ <slot>accept()</slot> <hints> <hint type="sourcelabel"> - <x>248</x> - <y>254</y> + <x>257</x> + <y>349</y> </hint> <hint type="destinationlabel"> <x>157</x> @@ -300,8 +414,8 @@ <slot>reject()</slot> <hints> <hint type="sourcelabel"> - <x>316</x> - <y>260</y> + <x>325</x> + <y>349</y> </hint> <hint type="destinationlabel"> <x>286</x> diff --git a/src/frontend/qt_sdl/PlatformConfig.cpp b/src/frontend/qt_sdl/PlatformConfig.cpp index 06128d7..bfb3f97 100644 --- a/src/frontend/qt_sdl/PlatformConfig.cpp +++ b/src/frontend/qt_sdl/PlatformConfig.cpp @@ -72,6 +72,7 @@ char MicWavPath[1024]; char LastROMFolder[1024]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp index fa542ad..4557d0e 100644 --- a/src/frontend/qt_sdl/main.cpp +++ b/src/frontend/qt_sdl/main.cpp @@ -1641,7 +1641,14 @@ void MainWindow::onStop() void MainWindow::onOpenEmuSettings() { - EmuSettingsDialog::openDlg(this); + EmuSettingsDialog* dlg = EmuSettingsDialog::openDlg(this); + connect(dlg, &EmuSettingsDialog::finished, this, &MainWindow::onEmuSettingsDialogFinished); +} + +void MainWindow::onEmuSettingsDialogFinished(int res) +{ + if (RunningSomething) + onReset(); } void MainWindow::onOpenInputConfig() diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h index 279aed8..eec2a48 100644 --- a/src/frontend/qt_sdl/main.h +++ b/src/frontend/qt_sdl/main.h @@ -199,6 +199,7 @@ private slots: void onStop(); void onOpenEmuSettings(); + void onEmuSettingsDialogFinished(int res); void onOpenInputConfig(); void onInputConfigFinished(int res); void onOpenVideoSettings(); diff --git a/src/version.h b/src/version.h index 6250601..9084606 100644 --- a/src/version.h +++ b/src/version.h @@ -19,7 +19,7 @@ #ifndef VERSION_H #define VERSION_H -#define MELONDS_VERSION "0.8.3" +#define MELONDS_VERSION "0.8.3-JIT" #define MELONDS_URL "http://melonds.kuribo64.net/" diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include <immintrin.h> +# elif defined(__SSE2__) +# include <emmintrin.h> +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include <arm_neon.h> +# undef inline +# endif +#elif defined(_MSC_VER) +# include <intrin.h> +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. 
+ * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. 
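+ * The default below can be overridden by pre-defining XXH_ACC_ALIGN before this
+ * file is included (note the #ifndef guard).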
+ */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. 
+ * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include <s390intrin.h> +# else +# include <altivec.h> +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. 
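   On targets without that instruction, the fallback below reproduces it with
   vec_perm and an explicit byte-reversal pattern, swapping the bytes of each
   64-bit lane.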
*/ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 
0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +/* + * Calculates a 32-bit to 64-bit long multiply. + * + * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) + * { + * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); + * } + */ +#if defined(_MSC_VER) && defined(_M_IX86) +# include <intrin.h> +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/* + * Calculates a 64->128-bit long multiply. + * + * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + +#else + /* + * Portable scalar method. 
Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/* + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/* Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * We don't need to (or want to) mix as much as XXH64. + * + * Short hashes are more evenly distributed, so it isn't necessary. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. 
+ * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + 
XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + __asm__ ("" : "+r" (seed64)); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
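   Each 16-byte chunk of input is mixed with 16 bytes of secret (and the seed)
   through a 64x64->128 multiply-fold (XXH3_mix16B above), and the running sum
   is avalanched at the end.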
*/ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); + + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. 
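+ *
+ * Schematically, per 64-bit accumulator lane (a scalar view of the SIMD
+ * variants below; the names here are illustrative only):
+ *
+ *   xxh_u64 data_key = input64 ^ secret64;
+ *   acc += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+ *   acc += input64;   // 64-bit mode; 128-bit mode adds the swapped lane instead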
+ * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret, + XXH3_accWidth_e accWidth) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[0] += data_vec; */ + __m512i const sum = _mm512_add_epi64(*xacc, data_vec); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_SSE2) + + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + if (accWidth == XXH3_acc_128bits) { + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXH3_acc_64bits */ + /* xacc[i] += data_vec; */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + if (accWidth == XXH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } + +#elif (XXH_VECTOR == XXH_VSX) + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXH3_acc_128bits */ + /* swap high and low halves */ +#ifdef __s390x__ + xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); +#else + xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + + if (accWidth == XXH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. 
In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ +#if (XXH_VECTOR == XXH_AVX512) + + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } + +#elif (XXH_VECTOR == XXH_AVX2) + + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_SSE2) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXH_VECTOR == XXH_NEON) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } + +#elif (XXH_VECTOR == XXH_VSX) + + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } + +#else /* scalar variant of Scrambler - universal */ + + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXH_PREFETCH_DIST 384 + +#ifdef __clang__ // for clang +# define XXH_PREFETCH_DIST_AVX512_64 320 +# define XXH_PREFETCH_DIST_AVX512_128 320 +#else // for gcc +# define XXH_PREFETCH_DIST_AVX512_64 640 +# define XXH_PREFETCH_DIST_AVX512_128 512 +#endif + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; +#if (XXH_VECTOR == XXH_AVX512) + if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64); + else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128); +#else + XXH_PREFETCH(in + XXH_PREFETCH_DIST); +#endif + XXH3_accumulate_512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE, + accWidth); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; + /* Do not align on 8, so that the secret is different from the scrambler */ +#define XXH_SECRET_LASTACC_START 7 + XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + __asm__("" : "+r" (result64)); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); +} + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + /* + * We need a separate pointer for the hack below. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8 *kSecretPtr = kSecret; + + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that kSecretPtr has been changed), the pipelines are used more efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + __asm__("" : "+r" (kSecretPtr)); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == kSecret); + + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64(customSecret + 16*i, lo); + XXH_writeLE64(customSecret + 16*i + 8, hi); + } +} + + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) +{ + return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. 
+ * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); +} + +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_64bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->acc[0] = PRIME32_3; + statePtr->acc[1] = PRIME64_1; + statePtr->acc[2] = PRIME64_2; + statePtr->acc[3] = PRIME64_3; + statePtr->acc[4] = PRIME64_4; + statePtr->acc[5] = PRIME32_2; + statePtr->acc[6] = PRIME64_5; + statePtr->acc[7] = PRIME32_1; + statePtr->seed = seed; + XXH_ASSERT(secret != NULL); + statePtr->secret = secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if 
(secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. 
+ */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. 
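As a quick usage sketch of the 64-bit API assembled above (the one-shot XXH3_64bits* dispatchers plus the streaming create/reset/update/digest cycle): the function names and return conventions below come straight from this file, while the seed value, the two-chunk split and the printf formatting are purely illustrative.

    /* minimal sketch; in this header revision XXH3 still sits behind the advanced/static section */
    #define XXH_STATIC_LINKING_ONLY
    #include "xxhash.h"
    #include <stdio.h>

    static void hash_both_ways(const void* data, size_t size)
    {
        /* One-shot: picks the 0-16 / 17-128 / 129-240 / long-input path internally. */
        XXH64_hash_t const one_shot = XXH3_64bits_withSeed(data, size, 1234);

        /* Streaming: the same input fed in two chunks through the state machine. */
        XXH3_state_t* const state = XXH3_createState();
        if (state == NULL) return;                    /* aligned allocation can fail */
        if (XXH3_64bits_reset_withSeed(state, 1234) == XXH_OK) {
            size_t const half = size / 2;
            XXH3_64bits_update(state, data, half);
            XXH3_64bits_update(state, (const char*)data + half, size - half);
            {   XXH64_hash_t const streamed = XXH3_64bits_digest(state);
                /* both paths are defined to produce the same value */
                printf("%016llx %016llx\n",
                       (unsigned long long)one_shot, (unsigned long long)streamed);
            }
        }
        XXH3_freeState(state);
    }
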
+ * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. */ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. 
+ */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. + * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) +{ + return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, + const xxh_u8* secret, size_t secretSize) +{ + return XXH3_hashLong_128b_internal(input, len, secret, secretSize); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); +} + + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ + +/* + * All the functions are actually the same as for the 64-bit streaming variant. + * The only difference is the finalization routine. 
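Before the streaming section, a short sketch of how a caller consumes the 128-bit one-shot entry points defined just above; the function and field names are taken from this file, the buffer and the printed check are illustrative, and XXH128_isEqual() is only defined a little further below.

    #define XXH_STATIC_LINKING_ONLY
    #include "xxhash.h"
    #include <stdio.h>

    static void hash_128(const void* key, size_t len)
    {
        /* XXH128() is simply a forwarding wrapper around XXH3_128bits_withSeed(). */
        XXH128_hash_t const a = XXH128(key, len, 0);
        XXH128_hash_t const b = XXH3_128bits(key, len);   /* unseeded variant; same result when seed == 0 */

        /* The result is a struct of two 64-bit halves rather than one integer. */
        printf("%016llx%016llx\n",
               (unsigned long long)a.high64, (unsigned long long)a.low64);

        /* XXH128_isEqual() (defined below) compares the full 128-bit value. */
        printf("equal: %d\n", XXH128_isEqual(a, b));
    }
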
+ */ + +static void +XXH3_128bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits); +} + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_128bits); + XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + state->secret + state->secretLimit + STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* 128-bit utility functions */ + +#include <string.h> /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). 
+ * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h new file mode 100644 index 0000000..67a5887 --- /dev/null +++ b/src/xxhash/xxhash.h @@ -0,0 +1,1965 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* TODO: update */ +/* Notice extracted from xxHash homepage: + +xxHash is an extremely fast hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +Note: SMHasher's CRC32 implementation is not the fastest one. +Other speed-oriented implementations can be faster, +especially in combination with PCLMUL instruction: +https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. 
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. 
+ * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
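+ *
+ * As an illustration (the `MYLIB_` prefix below is a placeholder chosen for
+ * this example, not part of the library): building with
+ * `-DXXH_NAMESPACE=MYLIB_` makes this header map XXH32 to MYLIB_XXH32 at
+ * preprocessing time, so the exported symbol is prefixed while calling code
+ * stays unchanged:
+ *
+ *   #include "xxhash.h"
+ *   XXH32_hash_t h = XXH32(buffer, size, 0);   // actually calls MYLIB_XXH32()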
+ */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 7 +#define XXH_VERSION_RELEASE 4 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Definitions +******************************/ +#include <stddef.h> /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint32_t XXH32_hash_t; +#else +# include <limits.h> +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# else +# if ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +# endif +#endif + +/*! + * XXH32(): + * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + * The memory between input & input+length must be valid (allocated and read-accessible). + * "seed" can be used to alter the result predictably. + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. + */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +/******* Streaming *******/ + +/* + * Streaming functions generate the xxHash value from an incrememtal input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. 
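+ *
+ * Putting these calls together, a typical streaming session looks roughly
+ * like the sketch below (XXH32 shown; `fill_buffer`, `buf` and `len` are
+ * placeholders for the caller's own input handling). Each step is described
+ * in the following paragraphs.
+ *
+ *   XXH32_state_t* const state = XXH32_createState();
+ *   XXH32_reset(state, 0);                          // seed = 0
+ *   while ((len = fill_buffer(buf, sizeof(buf))) > 0)
+ *       XXH32_update(state, buf, len);
+ *   XXH32_hash_t const hash = XXH32_digest(state);
+ *   XXH32_freeState(state);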
+ * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + */ + +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint64_t XXH64_hash_t; +#else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +#endif + +/*! + * XXH64(): + * Returns the 64-bit hash of sequence of length @length stored at memory + * address @input. + * @seed can be used to alter the result predictably. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. 
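+ *
+ * A one-shot call is a single line, e.g. (illustrative; `buf` and `bufSize`
+ * stand for the caller's own data):
+ *
+ *   XXH64_hash_t const h = XXH64(buf, bufSize, 0);   // seed = 0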
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. + */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. 
+ * + * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run + * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are + * explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The 128-bit version adds additional strength, but it is slightly slower. + * + * The XXH3 algorithm is still in development. + * The results it produces may still change in future versions. + * + * Results produced by v0.7.x are not comparable with results from v0.7.y. + * However, the API is completely stable, and it can safely be used for + * ephemeral data (local sessions). + * + * Avoid storing values in long-term storage until the algorithm is finalized. + * + * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if + * everything remains fine, its current format will be "frozen" and become the + * final one. + * + * After which, return values of XXH3 and XXH128 will no longer change in + * future versions. + * + * XXH3's return values will be officially finalized upon reaching v0.8.0. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) + +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) + +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +#endif + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional + * collision. + * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid trivial sequences, such as repeating sequences and especially '\0', + * as this can cancel out itself. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXH3_SECRET_SIZE_MIN 136 +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly based on the default + * secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. 
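+ *
+ * Illustrative call (the seed value is an arbitrary example):
+ *
+ *   XXH64_hash_t const h = XXH3_64bits_withSeed(buf, len, 0x1234);
+ *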
+ * Note: seed==0 produces the same results as XXH3_64bits(). + */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); + + +/* streaming 64-bit */ + +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ +# include <stdalign.h> +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +typedef struct XXH3_state_s XXH3_state_t; + +#define XXH3_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ +#define XXH3_INTERNALBUFFER_SIZE 256 +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /* used to store a custom secret generated from the seed. Makes state larger. + * Design might change */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + XXH32_hash_t bufferedSize; + XXH32_hash_t nbStripesPerBlock; + XXH32_hash_t nbStripesSoFar; + XXH32_hash_t secretLimit; + XXH32_hash_t reserved32; + XXH32_hash_t reserved32_2; + XXH64_hash_t totalLen; + XXH64_hash_t seed; + XXH64_hash_t reserved64; + /* note: there is some padding after due to alignment on 64 bytes */ + const unsigned char* secret; +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever possible. + */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + + +/* + * XXH3_64bits_reset(): + * Initialize with the default parameters. + * The result will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/* + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, and must outlive the hash streaming session, so + * be careful when using stack arrays. + * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`. 
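+ *
+ * A sketch that keeps the secret alive for the whole session (illustrative;
+ * `mySecret` is a placeholder assumed to be filled with random bytes
+ * elsewhere):
+ *
+ *   static unsigned char mySecret[XXH3_SECRET_SIZE_MIN];   // outlives the streaming session
+ *   XXH3_64bits_reset_withSecret(statePtr, mySecret, sizeof(mySecret));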
+ */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
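+ *
+ * For example (illustrative; `hashes` and `count` are the caller's own data):
+ *
+ *   qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);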
+ * + * return: >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 + */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[16]; } XXH128_canonical_t; +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); +XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be found in xxhash.c. + * + * However, code inlining requires the implementation to be visible to the + * compiler, usually within the header. + * + * As a workaround, xxhash.c used to be included within xxhash.h. This caused + * some issues with some build systems, especially ones which treat .c files + * as source files. + * + * Therefore, the implementation is now directly integrated within xxhash.h. + * Another small advantage is that xxhash.c is no longer needed in /include. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ +/*! + * XXH_FORCE_MEMORY_ACCESS: + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow to select a different access method for improved + * performance. + * Method 0 (default): + * Use `memcpy()`. Safe and portable. + * Method 1: + * `__attribute__((packed))` statement. It depends on compiler extensions + * and is therefore not portable. + * This method is safe if your compiler supports it, and *generally* as + * fast or faster than `memcpy`. + * Method 2: + * Direct access via cast. This method doesn't depend on the compiler but + * violates the C standard. + * It can generate buggy code on targets which do not support unaligned + * memory accesses. + * But in some circumstances, it's the only known way to get the most + * performance (ie GCC + ARMv6) + * Method 3: + * Byteshift. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. + * See https://stackoverflow.com/a/32095106/646947 for details. 
+ * Prefer these methods in priority order (0 > 1 > 2 > 3) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*! + *XXH_ACCEPT_NULL_INPUT_POINTER: + * If the input pointer is NULL, xxHash's default behavior is to dereference it, + * triggering a segfault. + * When this macro is enabled, xxHash actively checks the input for a null pointer. + * If it is, the result for null input pointers is the same as a zero-length input. + */ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*! + * XXH_FORCE_ALIGN_CHECK: + * This is a minor performance trick, only useful with lots of very small keys. + * It means: check for aligned/unaligned input. + * The check costs one initial branch per hash; + * Set it to 0 when the input is guaranteed to be aligned or when alignment + * doesn't matter for performance. + * + * This option does not affect XXH3. + */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +/*! + * XXH_NO_INLINE_HINTS: + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ +#ifndef XXH_NO_INLINE_HINTS +# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ + || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +/*! + * XXH_REROLL: + * Whether to reroll XXH32_finalize, and XXH64_finalize, + * instead of using an unrolled jump table/if statement loop. + * + * This is automatically defined on -Os/-Oz on GCC and Clang. + */ +#ifndef XXH_REROLL +# if defined(__OPTIMIZE_SIZE__) +# define XXH_REROLL 1 +# else +# define XXH_REROLL 0 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! + * Modify the local functions below should you wish to use some other memory + * routines for malloc() and free() + */ +#include <stdlib.h> + +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free(void* p) { free(p); } + +/*! 
and for memcpy() */ +#include <string.h> +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#else +# if defined (__cplusplus) \ + || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define XXH_FORCE_INLINE static inline __attribute__((always_inline)) +# define XXH_NO_INLINE static __attribute__((noinline)) +# else +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +# endif +# else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + + +/* ************************************* +* Debug +***************************************/ +/* + * DEBUGLEVEL is expected to be defined externally, typically via the compiler's + * command line options. The value must be a number. + */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + +#if (DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# define XXH_ASSERT(c) ((void)0) +#endif + +/* note: use after variable declarations */ +#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0) + + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + + +/* *** Memory access *** */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianess *** */ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/*! + * XXH_CPU_LITTLE_ENDIAN: + * Defined to 1 if the target is little endian, or 0 if it is big endian. 
+ * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ +static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ +static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ +static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ +static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ + +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= PRIME32_1; +#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * This inline assembly hack forces acc into a normal register. This is the + * only thing that prevents GCC and Clang from autovectorizing the XXH32 + * loop (pragmas and attributes don't work for some resason) without globally + * disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * How this hack works: + * __asm__("" // Declare an assembly block but don't declare any instructions + * : // However, as an Input/Output Operand, + * "+r" // constrain a read/write operand (+) as a general purpose register (r). + * (acc) // and set acc as the operand + * ); + * + * Because of the 'r', the compiler has promised that seed will be in a + * general purpose register and the '+' says that it will be 'read/write', + * so it has to assume it has changed. It is like volatile without all the + * loads and stores. + * + * Since the argument has to be in a normal register (not an SSE register), + * each time XXH32_round is called, it is impossible to vectorize. 
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API 
XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. 
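+ *
+ * A round trip looks like this (illustrative sketch; `h32` is a previously
+ * computed hash and `out` is a placeholder output buffer):
+ *
+ *   XXH32_canonical_t c;
+ *   XXH32_canonicalFromHash(&c, h32);
+ *   memcpy(out, c.digest, sizeof(c.digest));          // always big-endian
+ *   XXH32_hash_t const back = XXH32_hashFromCanonical(&c);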
+ */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. + * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 
7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API 
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif |
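
Note (not part of the diff above): the file added here is the upstream xxHash XXH64 implementation. It exposes a one-shot XXH64() entry point, a streaming createState/reset/update/digest sequence, and canonical (big-endian) conversion helpers; the disabled "#if 0" branch inside XXH64() shows that the one-shot and streaming paths compute the same digest. Below is a minimal standalone usage sketch of that public API. The XXH_STATIC_LINKING_ONLY / XXH_IMPLEMENTATION defines and the include path follow upstream xxHash's documented single-header pattern and are assumptions, not taken from this commit, which may compile the file into the build differently; the sample buffer is purely illustrative.

/* Hypothetical usage sketch; only API names visible in the diff are used. */
#define XXH_STATIC_LINKING_ONLY   /* assumption: expose state definitions (upstream pattern) */
#define XXH_IMPLEMENTATION        /* assumption: emit the implementation in this translation unit */
#include "xxhash.h"               /* assumption: include path as vendored */

#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char* data = "example buffer";          /* illustrative data only */
    size_t len = strlen(data);
    XXH64_hash_t seed = 0;

    /* One-shot hashing of a complete buffer. */
    XXH64_hash_t oneShot = XXH64(data, len, seed);

    /* Streaming hashing of the same bytes, fed in two chunks. */
    XXH64_state_t* state = XXH64_createState();
    XXH64_reset(state, seed);
    XXH64_update(state, data, len / 2);
    XXH64_update(state, data + len / 2, len - len / 2);
    XXH64_hash_t streamed = XXH64_digest(state);
    XXH64_freeState(state);

    /* Both paths yield the same 64-bit digest. */
    assert(oneShot == streamed);

    /* Canonical (big-endian) form, e.g. for storing the hash on disk. */
    XXH64_canonical_t canon;
    XXH64_canonicalFromHash(&canon, oneShot);
    assert(XXH64_hashFromCanonical(&canon) == oneShot);

    printf("XXH64 = %016llx\n", (unsigned long long)oneShot);
    return 0;
}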