From c5c342c0091d9bf36500950a21585c5c98dd7d9d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 22 Jun 2019 01:28:32 +0200 Subject: JIT: base all instructions are interpreted --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 332 +++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.h | 54 ++++++ 2 files changed, 386 insertions(+) create mode 100644 src/ARMJIT_x64/ARMJIT_Compiler.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Compiler.h (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp new file mode 100644 index 0000000..fb2fda8 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -0,0 +1,332 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMInterpreter.h" + +#include + +using namespace Gen; + +namespace ARMJIT +{ + +const int RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; +const int RegCache::NativeRegsCount = 5; + +Compiler::Compiler() +{ + AllocCodeSpace(1024 * 1024 * 4); +} + +typedef void (Compiler::*CompileFunc)(); +typedef void (*InterpretFunc)(ARM*); + +void Compiler::LoadCPSR() +{ + assert(!CPSRDirty); + + MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); +} + +void Compiler::SaveCPSR() +{ + if (CPSRDirty) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); + CPSRDirty = false; + } +} + +CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +{ + if (IsAlmostFull()) + { + ResetBlocks(); + ResetCodePtr(); + } + + CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); + + ConstantCycles = 0; + Thumb = cpu->CPSR & 0x20; + Num = cpu->Num; + R15 = cpu->R[15]; + + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + + MOV(64, R(RCPU), ImmPtr(cpu)); + XOR(32, R(RCycles), R(RCycles)); + + LoadCPSR(); + + for (int i = 0; i < instrsCount; i++) + { + R15 += Thumb ? 
2 : 4; + CurrentInstr = instrs[i]; + + CompileFunc comp = NULL; + + if (comp == NULL || i == instrsCount - 1) + { + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + if (i == instrsCount - 1) + { + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + } + + SaveCPSR(); + } + + if (Thumb) + { + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + } + else + { + } + } + else + { + u32 cond = CurrentInstr.Cond(); + if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } + else if (cond == 0xF) + AddCycles_C(); + else + { + FixupBranch skipExecute; + if (cond < 0xE) + { + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + skipExecute = J_CC(CC_Z); + } + else + { + // could have used a LUT, but then where would be the fun? + BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + + skipExecute = J_CC(cond & 1 ? 
CC_C : CC_NC); + } + + } + + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); + + u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); + ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + } + else + { + } + + FixupBranch skipFailed; + if (CurrentInstr.Cond() < 0xE) + { + skipFailed = J(); + SetJumpTarget(skipExecute); + + AddCycles_C(); + + SetJumpTarget(skipFailed); + } + } + } + + /* + we don't need to collect the interpreted cycles, + since all functions only add to it, the dispatcher + can take care of it. + */ + + if (comp == NULL && i != instrsCount - 1) + LoadCPSR(); + } + + SaveCPSR(); + + LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + RET(); + + return res; +} + +void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) +{ + const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = + { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + }; + + const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL + }; +} + +void Compiler::AddCycles_C() +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 
0 : CurrentInstr.CodeCycles); + + if (CurrentInstr.Cond() < 0xE) + ADD(32, R(RCycles), Imm8(cycles)); + else + ConstantCycles += cycles; +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return R(rm); + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), R(rm)); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), R(rm)); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + { + MOV(32, R(RSCRATCH2), R(rm)); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + else + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + if (amount > 0) + { + MOV(32, R(RSCRATCH), R(rm)); + ROR_(32, R(RSCRATCH), Imm8(amount)); + } + else + { + BT(32, R(RCPSR), Imm8(29)); + MOV(32, R(RSCRATCH), R(rm)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } +} + +void Compiler::A_Comp_ALU(const FetchedInstr& instr) +{ +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h new file mode 100644 index 0000000..8e1d100 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -0,0 +1,54 @@ +#ifndef ARMJIT_COMPILER_H +#define ARMJIT_COMPILER_H + +#include "../dolphin/x64Emitter.h" + +#include "../ARMJIT.h" + + +namespace ARMJIT +{ + +const Gen::X64Reg RCPU = Gen::RBP; +const Gen::X64Reg RCycles = Gen::R14; +const Gen::X64Reg RCPSR = Gen::R15; + +const Gen::X64Reg RSCRATCH = 
Gen::EAX; +const Gen::X64Reg RSCRATCH2 = Gen::EDX; +const Gen::X64Reg RSCRATCH3 = Gen::ECX; + +class Compiler : public Gen::X64CodeBlock +{ +public: + Compiler(); + + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + + void StartBlock(ARM* cpu); + CompiledBlock FinaliseBlock(); + + void Compile(RegCache& regs, const FetchedInstr& instr); +private: + void AddCycles_C(); + + Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed); + + void A_Comp_ALU(const FetchedInstr& instr); + + void LoadCPSR(); + void SaveCPSR(); + + bool CPSRDirty = false; + + FetchedInstr CurrentInstr; + + bool Thumb; + u32 Num; + u32 R15; + + u32 ConstantCycles; +}; + +} + +#endif \ No newline at end of file -- cgit v1.2.3 From ebce9f035ff05b414f1bb895beabb62bc539ac76 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 17:09:27 +0200 Subject: JIT: implemented most ALU instructions --- src/ARM.cpp | 18 +- src/ARMJIT.cpp | 16 +- src/ARMJIT.h | 25 +- src/ARMJIT_RegCache.h | 136 +++++++++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 546 +++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 245 ++++++++--------- src/ARMJIT_x64/ARMJIT_Compiler.h | 60 +++- src/CMakeLists.txt | 1 + 8 files changed, 881 insertions(+), 166 deletions(-) create mode 100644 src/ARMJIT_RegCache.h create mode 100644 src/ARMJIT_x64/ARMJIT_ALU.cpp (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index b709277..420257a 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -517,10 +517,10 @@ void ARMv5::Execute() AddCycles_C(); }*/ - if (!ARMJIT::IsMapped(Num, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]); + /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]);*/ - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(Num, R[15] - ((CPSR&0x20)?2:4)); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) block = 
ARMJIT::CompileBlock(this); Cycles += block(); @@ -572,7 +572,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - if (CPSR & 0x20) // THUMB + /*if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -600,7 +600,15 @@ void ARMv4::Execute() } else AddCycles_C(); - } + }*/ + + /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) + printf("aaarg ungempappter raum %x\n", R[15]);*/ + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); + if (block == NULL) + block = ARMJIT::CompileBlock(this); + Cycles += block(); // TODO optimize this shit!!! if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 489cdcf..74e154b 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,5 +1,7 @@ #include "ARMJIT.h" +#include + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -8,7 +10,6 @@ namespace ARMJIT Compiler* compiler; BlockCache cache; - #define DUP2(x) x, x static ptrdiff_t JIT_MEM[2][32] = { @@ -174,4 +175,17 @@ CompiledBlock CompileBlock(ARM* cpu) return block; } +void ResetBlocks() +{ + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); + memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); + memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); + memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); + memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); + memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); + memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); + memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); + memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index d718295..2ca29e8 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -3,8 +3,6 @@ #include "types.h" -#include - #include "ARM.h" #include "ARM_InstrInfo.h" @@ -13,14 +11,6 @@ namespace ARMJIT typedef u32 (*CompiledBlock)(); -class RegCache -{ - -static const int NativeRegAllocOrder[]; -static const int NativeRegsCount; - -}; - struct FetchedInstr { u32 A_Reg(int pos) 
const @@ -117,24 +107,13 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; } -inline void ResetBlocks() -{ - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); -} - void Init(); void DeInit(); CompiledBlock CompileBlock(ARM* cpu); +void ResetBlocks(); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h new file mode 100644 index 0000000..e18d50f --- /dev/null +++ b/src/ARMJIT_RegCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include + +namespace ARMJIT +{ + +template +class RegCache +{ +public: + RegCache() + {} + + RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->UnloadReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + 
Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs; + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..d06c99c --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -0,0 +1,546 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +// uses 
RSCRATCH3 +void Compiler::Comp_ArithTriOp(void (Compiler::*op)(int, const OpArg&, const OpArg&), + OpArg rd, OpArg rn, OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (rd == rn && !(opFlags & opInvertOp2)) + (this->*op)(32, rd, op2); + else if (opFlags & opSymmetric && op2 == R(RSCRATCH)) + { + if (opFlags & opInvertOp2) + NOT(32, op2); + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + } + else + { + if (opFlags & opInvertOp2) + { + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + NOT(32, op2); + } + MOV(32, R(RSCRATCH3), rn); + (this->*op)(32, R(RSCRATCH3), op2); + MOV(32, rd, R(RSCRATCH3)); + } + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) +{ + switch (op) + { + case 0: // TST + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; + } + + Comp_RetriveFlags(op == 2, op >= 2, carryUsed); +} + +// also 
calculates cycles +OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) +{ + if (CurrentInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + carryUsed = false; + return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + if (CurrentInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + rm = Imm32(rm.Imm32() + 4); + return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + } + else + { + Comp_AddCycles_C(); + return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, + MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + } + } +} + +void Compiler::A_Comp_CmpOp() +{ + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); + + Comp_CmpOp(op - 0x8, rn, op2, carryUsed); +} + +void Compiler::A_Comp_Arith() +{ + bool S = CurrentInstr.Instr & (1 << 20); + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); + + u32 sFlag = S ? 
opSetsFlags : 0; + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x2: // SUB + Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x3: // RSB + if (op2.IsZero()) + { + if (rd != rn) + MOV(32, rd, rn); + NEG(32, rd); + if (S) + Comp_RetriveFlags(true, true, false); + } + else + Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x4: // ADD + Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + return; + case 0x7: // RSC + Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + return; + default: + assert("unimplemented"); + } +} + +void Compiler::A_Comp_MovOp() +{ + bool carryUsed; + bool S = CurrentInstr.Instr & (1 << 20); + OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + + if (rd != op2) + MOV(32, rd, op2); + + if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + NOT(32, rd); + + if (S) + { + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); + } +} + +void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) +{ + CPSRDirty = true; + + bool carryOnly = !retriveCV && carryUsed; + if (retriveCV) + { + SETcc(CC_O, R(RSCRATCH)); + SETcc(sign ? 
CC_NC : CC_C, R(RSCRATCH3)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); + } + + if (carryUsed == 983298) + printf("etwas ist faul im lande daenemark %x\n", CurrentInstr.Instr); + + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); +} + +// always uses RSCRATCH, RSCRATCH2 only if S == true +OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = S; + + if (S) + { + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + BT(32, R(RCPSR), Imm8(29)); + SETcc(CC_C, R(RSCRATCH2)); + } + + MOV(32, R(RSCRATCH), rm); + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rs); + AND(32, R(ECX), Imm32(0xFF)); + + FixupBranch zero = J_CC(CC_Z); + if (op < 3) + { + void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; + if (op == 0) + shiftOp = SHL; + else if (op == 1) + shiftOp = SHR; + else if (op == 2) + shiftOp = SAR; + + CMP(32, R(ECX), Imm8(32)); + FixupBranch lt32 = J_CC(CC_L); + FixupBranch done1; + if (op < 2) + { + FixupBranch eq32 = J_CC(CC_E); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + if (S) + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + done1 = J(); + SetJumpTarget(eq32); + } + (this->*shiftOp)(32, R(RSCRATCH), Imm8(31)); + (this->*shiftOp)(32, R(RSCRATCH), Imm8(1)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + FixupBranch done2 = J(); + + SetJumpTarget(lt32); + (this->*shiftOp)(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + if (op < 2) + SetJumpTarget(done1); + SetJumpTarget(done2); + + } + else if (op == 3) + { + if (S) + BT(32, R(RSCRATCH), Imm8(31)); + ROR_(32, 
R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + } + SetJumpTarget(zero); + + return R(RSCRATCH); +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + + assert(false); +} + +void Compiler::T_Comp_ShiftImm() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + int amount = (CurrentInstr.Instr >> 6) & 0x1F; + + Comp_AddCycles_C(); + + bool carryUsed; + OpArg shifted = Comp_RegShiftImm(op, amount, rs, true, carryUsed); + + if (shifted != rd) + MOV(32, rd, shifted); + + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); +} + +void Compiler::T_Comp_AddSub_() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 9) & 0x3; + + OpArg rn = op >= 2 ? 
Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + + Comp_AddCycles_C(); + + if (op & 1) + Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + else + Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); +} + +void Compiler::T_Comp_ALU_Imm8() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + + u32 op = (CurrentInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; + } +} + +void Compiler::T_Comp_ALU() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + u32 op = (CurrentInstr.Instr >> 6) & 0xF; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + int shiftOp = op == 7 ? 
3 : op - 0x2; + bool carryUsed; + OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); + TEST(32, shifted, shifted); + MOV(32, rd, shifted); + Comp_RetriveFlags(false, false, true); + } + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + return; + case 0x8: // TST + Comp_CmpOp(0, rd, rs, false); + return; + case 0x9: // NEG + if (rd != rs) + MOV(32, rd, rs); + NEG(32, rd); + Comp_RetriveFlags(true, true, false); + return; + case 0xA: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0xB: // CMN + Comp_CmpOp(3, rd, rs, false); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + return; + case 0xF: // MVN + if (rd != rs) + MOV(32, rd, rs); + NOT(32, rd); + Comp_RetriveFlags(false, false, false); + return; + default: + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); + OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + + u32 op = (CurrentInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // ADD + Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); + return; + case 0x1: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0x2: // MOV + if (rd != rs) + MOV(32, rd, rs); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + } +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fb2fda8..f51d4d9 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -8,18 +8,16 @@ using namespace Gen; namespace ARMJIT { - -const int 
RegCache::NativeRegAllocOrder[] = {(int)RBX, (int)RSI, (int)RDI, (int)R12, (int)R13}; -const int RegCache::NativeRegsCount = 5; +template <> +const X64Reg RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +template <> +const int RegCache::NativeRegsAvailable = 5; Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 4); } -typedef void (Compiler::*CompileFunc)(); -typedef void (*InterpretFunc)(ARM*); - void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -36,6 +34,19 @@ void Compiler::SaveCPSR() } } +void Compiler::LoadReg(int reg, X64Reg nativeReg) +{ + if (reg != 15) + MOV(32, R(nativeReg), MDisp(RCPU, offsetof(ARM, R[reg]))); + else + MOV(32, R(nativeReg), Imm32(R15)); +} + +void Compiler::UnloadReg(int reg, X64Reg nativeReg) +{ + MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -58,12 +69,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); + // TODO: this is ugly as a whole, do better + RegCache = ARMJIT::RegCache(this, instrs, instrsCount); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 
2 : 4; CurrentInstr = instrs[i]; - CompileFunc comp = NULL; + CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind); + + if (CurrentInstr.Info.Branches()) + comp = NULL; if (comp == NULL || i == instrsCount - 1) { @@ -79,6 +96,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SaveCPSR(); } + if (comp != NULL) + RegCache.Prepare(i); + else + RegCache.Flush(); + if (Thumb) { if (comp == NULL) @@ -89,8 +111,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } else - { - } + (this->*comp)(); } else { @@ -101,7 +122,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_CallFunction(ARMInterpreter::A_BLX_IMM); } else if (cond == 0xF) - AddCycles_C(); + Comp_AddCycles_C(); else { FixupBranch skipExecute; @@ -115,17 +136,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RSCRATCH), Imm32(1)); SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - + skipExecute = J_CC(CC_Z); } else { // could have used a LUT, but then where would be the fun? BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); - + skipExecute = J_CC(cond & 1 ? 
CC_C : CC_NC); } - + } if (comp == NULL) @@ -136,8 +157,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); } else - { - } + (this->*comp)(); FixupBranch skipFailed; if (CurrentInstr.Cond() < 0xE) @@ -145,7 +165,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs skipFailed = J(); SetJumpTarget(skipExecute); - AddCycles_C(); + Comp_AddCycles_C(); SetJumpTarget(skipFailed); } @@ -155,13 +175,14 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs /* we don't need to collect the interpreted cycles, since all functions only add to it, the dispatcher - can take care of it. + takes care of it. */ if (comp == NULL && i != instrsCount - 1) LoadCPSR(); } + RegCache.Flush(); SaveCPSR(); LEA(32, RAX, MDisp(RCycles, ConstantCycles)); @@ -172,42 +193,57 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs return res; } -void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) +CompileFunc Compiler::GetCompFunc(int kind) { + // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
+ // see ARMInstrInfo.h for the order const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = { - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // AND + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // EOR + A_Comp_Arith, A_Comp_Arith, 
A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // SUB + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // RSB + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ADD + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ADC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // SBC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // RSC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // ORR + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, 
A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // MOV + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + // BIC + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, + // MVN + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, + // TST + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // TEQ + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // CMP + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // CMN + A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -227,21 +263,34 @@ void Compiler::Compile(RegCache& regs, const FetchedInstr& instr) }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // Shift imm + 
T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, + // Three operand ADD/SUB + T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, + // 8 bit imm + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + // general ALU + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + // hi reg + T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, + // pc/sp relative + NULL, NULL, NULL, + // mem... + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + + return Thumb ? T_Comp[kind] : A_Comp[kind]; } -void Compiler::AddCycles_C() +void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] @@ -253,80 +302,16 @@ void Compiler::AddCycles_C() ConstantCycles += cycles; } -// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue -OpArg Compiler::Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed) -{ - carryUsed = true; - - switch (op) - { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return R(rm); - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), R(rm)); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR - MOV(32, R(RSCRATCH), R(rm)); - SAR(32, R(RSCRATCH), Imm8(amount ? 
amount : 31)); - if (S) - { - if (amount == 0) - { - MOV(32, R(RSCRATCH2), R(rm)); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - else - SETcc(CC_C, R(RSCRATCH2)); - } - return R(RSCRATCH); - case 3: // ROR - if (amount > 0) - { - MOV(32, R(RSCRATCH), R(rm)); - ROR_(32, R(RSCRATCH), Imm8(amount)); - } - else - { - BT(32, R(RCPSR), Imm8(29)); - MOV(32, R(RSCRATCH), R(rm)); - RCR(32, R(RSCRATCH), Imm8(1)); - } - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } -} - -void Compiler::A_Comp_ALU(const FetchedInstr& instr) +void Compiler::Comp_AddCycles_CI(u32 i) { + s32 cycles = (Num ? + NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; + + if (CurrentInstr.Cond() < 0xE) + ADD(32, R(RCycles), Imm8(cycles)); + else + ConstantCycles += cycles; } } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 8e1d100..9b454f4 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,7 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" - +#include "../ARMJIT_RegCache.h" namespace ARMJIT { @@ -17,6 +17,10 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +class Compiler; + +typedef void (Compiler::*CompileFunc)(); + class Compiler : public Gen::X64CodeBlock { public: @@ -24,24 +28,66 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); - void StartBlock(ARM* cpu); - CompiledBlock FinaliseBlock(); + void LoadReg(int reg, Gen::X64Reg nativeReg); + void UnloadReg(int reg, Gen::X64Reg nativeReg); - void Compile(RegCache& regs, const FetchedInstr& instr); private: - void AddCycles_C(); + CompileFunc GetCompFunc(int kind); + + void Comp_AddCycles_C(); + void Comp_AddCycles_CI(u32 i); + + enum + { + opSetsFlags = 1 << 0, + opSymmetric = 1 << 1, + opRetriveCV = 1 << 2, + opInvertCarry = 1 << 3, + opSyncCarry 
= 1 << 4, + opInvertOp2 = 1 << 5, + }; + + void A_Comp_Arith(); + void A_Comp_MovOp(); + void A_Comp_CmpOp(); - Gen::OpArg Comp_ShiftRegImm(int op, int amount, Gen::X64Reg rm, bool S, bool& carryUsed); + void T_Comp_ShiftImm(); + void T_Comp_AddSub_(); + void T_Comp_ALU_Imm8(); + void T_Comp_ALU(); + void T_Comp_ALU_HiReg(); - void A_Comp_ALU(const FetchedInstr& instr); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); + void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); + Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); + + Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); void LoadCPSR(); void SaveCPSR(); + Gen::OpArg MapReg(int reg) + { + if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) + return Gen::Imm32(R15); + + assert(RegCache.Mapping[reg] != Gen::INVALID_REG); + return Gen::R(RegCache.Mapping[reg]); + } + bool CPSRDirty = false; FetchedInstr CurrentInstr; + RegCache RegCache; + bool Thumb; u32 Num; u32 R15; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87200ad..d88638a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -33,6 +33,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp -- cgit v1.2.3 From ff901141e77ad6c8d2910d77bef2b7c5674fcc7f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 18:28:01 +0200 Subject: jit: correct cycle counting for thumb shift by reg --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 7 +++++-- 
src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 0 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_LoadStore.cpp (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index d06c99c..dc82af7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -456,7 +456,10 @@ void Compiler::T_Comp_ALU() u32 op = (CurrentInstr.Instr >> 6) & 0xF; - Comp_AddCycles_C(); + if ((op >= 0x2 && op < 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); switch (op) { @@ -471,7 +474,7 @@ void Compiler::T_Comp_ALU() case 0x4: case 0x7: { - int shiftOp = op == 7 ? 3 : op - 0x2; + int shiftOp = op == 0x7 ? 3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); TEST(32, shifted, shifted); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3 From 5f932cdf48681414465512fb47d619ad73414137 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 30 Jun 2019 13:35:03 +0200 Subject: JIT: compilation of word load and store --- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 3 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 111 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 600 ++++++++++++++++++++++++++++++++++++ src/ARM_InstrInfo.h | 8 +- src/CMakeLists.txt | 1 + src/dolphin/x64ABI.h | 3 +- 10 files changed, 712 insertions(+), 43 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74e154b..4da781c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -40,8 +40,7 @@ static ptrdiff_t JIT_MEM[2][32] = { /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), /* 3X*/ offsetof(BlockCache, SWRAM), offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ -1, - offsetof(BlockCache, ARM7_WIRAM), + /* 4X*/ DUP2(-1), /* 5X*/ DUP2(-1), /* 6X*/ 
DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ @@ -183,7 +182,6 @@ void ResetBlocks() memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2ca29e8..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -63,14 +63,13 @@ struct BlockCache { CompiledBlock* AddrMapping[2][0x4000] = {0}; - CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM CompiledBlock ARM9_ITCM[0x8000/2]; CompiledBlock ARM9_LCDC[0xA4000/2]; CompiledBlock ARM9_BIOS[0x8000/2]; CompiledBlock ARM7_BIOS[0x4000/2]; CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM }; diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index e18d50f..ea9fb30 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -30,7 +30,7 @@ public: assert(Mapping[reg] != -1); if (DirtyRegs & (1 << reg)) - Compiler->UnloadReg(reg, Mapping[reg]); + Compiler->SaveReg(reg, Mapping[reg]); DirtyRegs &= ~(1 << reg); LoadedRegs &= ~(1 << reg); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index dc82af7..6294e1d 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -255,8 +255,8 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b if (S) { XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - BT(32, R(RCPSR), Imm8(29)); - SETcc(CC_C, R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); } MOV(32, R(RSCRATCH), rm); diff 
--git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index f51d4d9..9096397 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,13 +9,43 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +const X64Reg RegCache::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13 +#else + RBX, R12, R13 +#endif +}; template <> -const int RegCache::NativeRegsAvailable = 5; +const int RegCache::NativeRegsAvailable = +#ifdef _WIN32 + 5 +#else + 3 +#endif +; Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 4); + AllocCodeSpace(1024 * 1024 * 16); + + for (int i = 0; i < 15; i++) + { + ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); + WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); + for (int j = 0; j < 2; j++) + { + ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); + WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + } + } + ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); + WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); + ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); + WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); + + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -42,7 +72,7 @@ void Compiler::LoadReg(int reg, X64Reg nativeReg) MOV(32, R(nativeReg), Imm32(R15)); } -void Compiler::UnloadReg(int reg, X64Reg nativeReg) +void Compiler::SaveReg(int reg, X64Reg nativeReg) { MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } @@ -52,7 +82,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) { ResetBlocks(); - ResetCodePtr(); + SetCodePtr((u8*)ResetStart); } CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -61,8 +91,9 @@ 
CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Thumb = cpu->CPSR & 0x20; Num = cpu->Num; R15 = cpu->R[15]; + CodeRegion = cpu->CodeRegion; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); XOR(32, R(RCycles), R(RCycles)); @@ -142,9 +173,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else { // could have used a LUT, but then where would be the fun? - BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); + skipExecute = J_CC(cond & 1 ? CC_NZ : CC_Z); } } @@ -187,7 +218,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LEA(32, RAX, MDisp(RCycles, ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); return res; @@ -243,23 +274,38 @@ CompileFunc Compiler::GetCompFunc(int kind) A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // Mul + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + NULL, NULL, NULL, NULL, NULL, + // STR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // STRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // LDRB + NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // STRH + NULL, NULL, NULL, NULL, + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + NULL, NULL, NULL, NULL, + // LDRSB NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRSH + NULL, NULL, NULL, NULL, + // swap + NULL, NULL, + // LDM/STM + NULL, NULL, + // Branch + NULL, NULL, NULL, NULL, NULL, + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { @@ -278,10 +324,17 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative NULL, NULL, NULL, - // mem... - NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // LDR pcrel + NULL, + // LDR/STR reg offset + T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, + // LDR/STR sign extended, half + NULL, NULL, NULL, NULL, + // LDR/STR imm offset + T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + // LDR/STR half imm offset + NULL, NULL, + // branch, etc. 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9b454f4..7ab9b25 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,7 +29,7 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); - void UnloadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); private: CompileFunc GetCompFunc(int kind); @@ -51,12 +51,17 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MemWB(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), @@ -65,10 +70,14 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void* Gen_MemoryRoutine9(bool store, int size, u32 region); + void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); void SaveCPSR(); @@ -82,6 +91,8 @@ private: return Gen::R(RegCache.Mapping[reg]); } + void* ResetStart; + bool CPSRDirty = false; FetchedInstr CurrentInstr; @@ -91,10 +102,16 @@ private: bool Thumb; u32 Num; u32 R15; + u32 CodeRegion; u32 ConstantCycles; }; +extern void* ReadMemFuncs9[16]; +extern void* ReadMemFuncs7[2][16]; +extern void* WriteMemFuncs9[16]; +extern void* 
WriteMemFuncs7[2][16]; + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index e69de29..d534269 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,600 @@ +#include "ARMJIT_Compiler.h" + +#include "../GPU.h" +#include "../Wifi.h" + +namespace NDS +{ +#define MAIN_RAM_SIZE 0x400000 +extern u8* SWRAM_ARM9; +extern u32 SWRAM_ARM9Mask; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM7Mask; +extern u8 ARM7WRAM[]; +extern u16 ARM7BIOSProt; +} + +using namespace Gen; + +namespace ARMJIT +{ + +void* ReadMemFuncs9[16]; +void* ReadMemFuncs7[2][16]; +void* WriteMemFuncs9[16]; +void* WriteMemFuncs7[2][16]; + +template +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +u32 ReadVRAM9(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void WriteVRAM9(u32 addr, u32 val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} + +/* + R11 - data to write (store only) + RSCRATCH2 - address + RSCRATCH3 - code cycles +*/ +void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +{ + AlignCode4(); + void* res = (void*)GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // cycle counting! 
+ // this is AddCycles_CDI + MOV(32, R(R10), R(RSCRATCH2)); + SHR(32, R(R10), Imm8(12)); + MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); + CMP(32, R(R10), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(R10), CC_G); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); + ADD(32, R(RCycles), R(RSCRATCH3)); + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + { + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch outsideDTCM = J_CC(CC_AE); + AND(32, R(RSCRATCH2), Imm32(0x3FFF)); + if (!store) + { + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); + RET(); + SetJumpTarget(outsideDTCM); + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + } + + switch (region) + { + case 0x00000000: + case 0x01000000: + { + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + RET(); + SetJumpTarget(insideITCM); + AND(32, R(RSCRATCH2), Imm32(0x7FFF)); + if (!store) + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); + else + { + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); + } + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, 
SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); + TEST(64, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); + if (!store) + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); + else + { + MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + SetJumpTarget(notMapped); + } + break; + case 0x04000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9IORead32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9IOWrite32, true); + } + break; + case 0x05000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(ReadVRAM9); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)WriteVRAM9, true); + } + break; + case 0x07000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), 
M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + case 0xFF000000: + if (!store) + { + AND(32, R(RSCRATCH2), Imm32(0xFFF)); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); + } + break; + default: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9Write32, true); + } + break; + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +{ + AlignCode4(); + void* res = GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // AddCycles_CDI + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); + if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + { + if (!store && region != 0x02000000) + LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); + ADD(32, R(RCycles), R(RSCRATCH3)); + } + else + { + if (!store) + ADD(32, R(region == 0x02000000 ? 
RSCRATCH2 : RSCRATCH), Imm8(1)); + LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); + CMP(32, R(RSCRATCH3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); + CMP(32, R(R10), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + switch (region) + { + case 0x00000000: + if (!store) { + CMP(32, R(RSCRATCH2), Imm32(0x4000)); + FixupBranch outsideBIOS1 = J_CC(CC_AE); + + MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); + CMP(32, R(RSCRATCH), Imm32(0x4000)); + FixupBranch outsideBIOS2 = J_CC(CC_AE); + MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); + CMP(32, R(RSCRATCH2), R(RSCRATCH3)); + FixupBranch notDenied1 = J_CC(CC_AE); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + FixupBranch notDenied2 = J_CC(CC_B); + SetJumpTarget(outsideBIOS2); + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + RET(); + + SetJumpTarget(notDenied1); + SetJumpTarget(notDenied2); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + + SetJumpTarget(outsideBIOS1); + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); + if (!store) + { + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); + MOV(32, R(ECX), MDisp(RSP, 8)); + 
ROR_(32, R(RSCRATCH), R(ECX)); + } + else + { + MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + RET(); + SetJumpTarget(region); + SetJumpTarget(notMapped); + AND(32, R(RSCRATCH2), Imm32(0xFFFF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); + } + } + break; + case 0x04000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(NDS::ARM7IORead32); + ABI_PopRegistersAndAdjustStack({}, 8); + + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM7IOWrite32, true); + } + SetJumpTarget(region); + + if (!store) + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); + + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({EAX}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + MOV(32, R(RSCRATCH2), R(EAX)); + SHL(32, R(RSCRATCH2), Imm8(16)); + ABI_PopRegistersAndAdjustStack({EAX}, 8); + OR(32, R(EAX), R(RSCRATCH2)); + } + else + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + SHR(32, R(R11), Imm8(16)); + ADD(32, R(RSCRATCH2), Imm8(2)); + 
ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + } + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(GPU::ReadVRAM_ARM7); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)GPU::WriteVRAM_ARM7, true); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + /*default: + ABI_PushRegistersAndAdjustStack({}, 8, 0); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + break;*/ + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +OpArg Compiler::A_Comp_GetMemWBOffset() +{ + if (!(CurrentInstr.Instr & (1 << 25))) + return Imm32(CurrentInstr.Instr & 0xFFF); + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + int amount = (CurrentInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); + } +} + +void Compiler::A_Comp_MemWB() +{ + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + bool load = CurrentInstr.Instr & (1 << 20); + + MOV(32, R(RSCRATCH2), rn); + if (CurrentInstr.Instr & (1 << 24)) + { + OpArg offset = A_Comp_GetMemWBOffset(); + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, R(RSCRATCH2), offset); + else + SUB(32, R(RSCRATCH2), offset); + + if (CurrentInstr.Instr & (1 << 21)) + 
MOV(32, rn, R(RSCRATCH2)); + } + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, R(RSCRATCH2), R(RSCRATCH)); + + if (!(CurrentInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + if (load) + MOV(32, rd, R(RSCRATCH2)); +} + +void Compiler::T_Comp_MemReg() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + + int op = (CurrentInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + + MOV(32, R(RSCRATCH2), rb); + ADD(32, R(RSCRATCH2), ro); + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::T_Comp_MemImm() +{ + // TODO: aufräumen!!! 
+ OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + bool load = op & 0x1; + + LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index e717664..dcd938b 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -83,10 +83,10 @@ enum ak_ALU(BIC), ak_ALU(MVN), - ak_ALU(TST), - ak_ALU(TEQ), - ak_ALU(CMP), - ak_ALU(CMN), + ak_Test(TST), + ak_Test(TEQ), + ak_Test(CMP), + ak_Test(CMN), ak_MUL, ak_MLA, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d88638a..662ed5c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h index 997782e..94336d0 100644 --- a/src/dolphin/x64ABI.h +++ b/src/dolphin/x64ABI.h @@ -37,7 +37,8 @@ // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. 
#define ABI_ALL_CALLER_SAVED \ - (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11}) + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) #else // 64-bit Unix / OS X #define ABI_PARAM1 RDI -- cgit v1.2.3 From 2c44bf927c230efbbd1b27920de062ddcc631fcf Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 6 Jul 2019 01:48:42 +0200 Subject: JIT: most mem instructions working + branching --- src/ARM.cpp | 10 +- src/ARMJIT.cpp | 7 +- src/ARMJIT.h | 2 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 322 ++++++++------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 145 ++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 42 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 805 +++++++++++++++--------------------- src/ARM_InstrInfo.cpp | 2 +- src/NDS.cpp | 2 + 10 files changed, 653 insertions(+), 686 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index 420257a..f7ca26d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -522,8 +522,9 @@ void ARMv5::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) @@ -607,8 +608,9 @@ void ARMv4::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! 
if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 4da781c..6afa967 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,12 +121,13 @@ void DeInit() delete compiler; } -CompiledBlock CompileBlock(ARM* cpu) +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; FetchedInstr instrs[12]; int i = 0; + u32 r15Initial = cpu->R[15]; u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; //printf("block %x %d\n", r15, thumb); @@ -169,9 +170,7 @@ CompiledBlock CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block); - - return block; + InsertBlock(cpu->Num, r15Initial - (thumb ? 2 : 4), block); } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..71188f9 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index ea9fb30..556d27b 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -114,7 +114,7 @@ public: for (int reg : needToBeLoaded) LoadRegister(reg); } - DirtyRegs |= Instr.Info.DstRegs; + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6294e1d..c22751e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -71,30 +71,30 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) { switch (op) { - case 0: // TST - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - TEST(32, rn, op2); - break; - case 1: // TEQ + case 0: // TST + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - XOR(32, R(RSCRATCH3), op2); - break; - case 2: // CMP - if (rn.IsImm()) - { - MOV(32, 
R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - CMP(32, rn, op2); - break; - case 3: // CMN + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - ADD(32, R(RSCRATCH3), op2); - break; + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; } Comp_RetriveFlags(op == 2, op >= 2, carryUsed); @@ -103,38 +103,38 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) // also calculates cycles OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) { - if (CurrentInstr.Instr & (1 << 25)) + if (CurInstr.Instr & (1 << 25)) { Comp_AddCycles_C(); carryUsed = false; - return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + return Imm32(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - if (CurrentInstr.Instr & (1 << 4)) + int op = (CurInstr.Instr >> 5) & 0x3; + if (CurInstr.Instr & (1 << 4)) { Comp_AddCycles_CI(1); - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); - if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + OpArg rm = MapReg(CurInstr.A_Reg(0)); + if (rm.IsImm() && CurInstr.A_Reg(0) == 15) rm = Imm32(rm.Imm32() + 4); - return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + return Comp_RegShiftReg(op, MapReg(CurInstr.A_Reg(8)), rm, S, carryUsed); } else { Comp_AddCycles_C(); - return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, - MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + return Comp_RegShiftImm(op, (CurInstr.Instr >> 7) & 0x1F, + MapReg(CurInstr.A_Reg(0)), S, carryUsed); } } } void Compiler::A_Comp_CmpOp() { - u32 op = (CurrentInstr.Instr >> 21) & 0xF; + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rn = 
MapReg(CurInstr.A_Reg(16)); OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); Comp_CmpOp(op - 0x8, rn, op2, carryUsed); @@ -142,12 +142,12 @@ void Compiler::A_Comp_CmpOp() void Compiler::A_Comp_Arith() { - bool S = CurrentInstr.Instr & (1 << 20); - u32 op = (CurrentInstr.Instr >> 21) & 0xF; + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); u32 sFlag = S ? opSetsFlags : 0; @@ -155,13 +155,13 @@ void Compiler::A_Comp_Arith() { case 0x0: // AND Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x1: // EOR Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x2: // SUB Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x3: // RSB if (op2.IsZero()) { @@ -173,41 +173,44 @@ void Compiler::A_Comp_Arith() } else Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x4: // ADD Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); - return; + break; case 0x5: // ADC Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); - return; + break; case 0x6: // SBC Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); - return; + break; case 0x7: // RSC Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); - return; + break; case 0xC: // ORR Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0xE: // BIC Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); - return; + break; default: assert("unimplemented"); 
} + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::A_Comp_MovOp() { bool carryUsed; - bool S = CurrentInstr.Instr & (1 << 20); + bool S = CurInstr.Instr & (1 << 20); OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); if (rd != op2) MOV(32, rd, op2); - if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + if (((CurInstr.Instr >> 21) & 0xF) == 0xF) NOT(32, rd); if (S) @@ -215,6 +218,9 @@ void Compiler::A_Comp_MovOp() TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) @@ -230,7 +236,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) } if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurrentInstr.Instr); + printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); @@ -324,61 +330,61 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car switch (op) { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return rm; - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), rm); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR + case 0: // LSL + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - SAR(32, R(RSCRATCH), Imm8(amount ? 
amount : 31)); + SHL(32, R(RSCRATCH), Imm8(amount)); if (S) - { - if (amount == 0) - BT(32, rm, Imm8(31)); SETcc(CC_C, R(RSCRATCH2)); - } + return R(RSCRATCH); - case 3: // ROR + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - if (amount > 0) - ROR_(32, R(RSCRATCH), Imm8(amount)); - else - { - BT(32, R(RCPSR), Imm8(29)); - RCR(32, R(RSCRATCH), Imm8(1)); - } + SHR(32, R(RSCRATCH), Imm8(amount)); if (S) SETcc(CC_C, R(RSCRATCH2)); return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); } assert(false); @@ -386,11 +392,11 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car void Compiler::T_Comp_ShiftImm() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 11) & 0x3; - int amount = (CurrentInstr.Instr >> 6) & 0x1F; + int op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; Comp_AddCycles_C(); @@ -406,12 +412,12 @@ void Compiler::T_Comp_ShiftImm() void Compiler::T_Comp_AddSub_() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 9) & 0x3; + int op = (CurInstr.Instr >> 9) & 0x3; - OpArg rn = op >= 2 ? 
Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + OpArg rn = op >= 2 ? Imm32((CurInstr.Instr >> 6) & 0x7) : MapReg(CurInstr.T_Reg(6)); Comp_AddCycles_C(); @@ -423,38 +429,38 @@ void Compiler::T_Comp_AddSub_() void Compiler::T_Comp_ALU_Imm8() { - OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + OpArg rd = MapReg(CurInstr.T_Reg(8)); - u32 op = (CurrentInstr.Instr >> 11) & 0x3; - OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + u32 op = (CurInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurInstr.Instr & 0xFF); Comp_AddCycles_C(); switch (op) { - case 0x0: - MOV(32, rd, imm); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; - case 0x1: - Comp_CmpOp(2, rd, imm, false); - return; - case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); - return; - case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); - return; + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; } } void Compiler::T_Comp_ALU() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - u32 op = (CurrentInstr.Instr >> 6) & 0xF; + u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) Comp_AddCycles_CI(1); @@ -522,28 +528,62 @@ void Compiler::T_Comp_ALU() void Compiler::T_Comp_ALU_HiReg() { - OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); - OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + OpArg rdMapped = MapReg(rd); + OpArg rs = 
MapReg((CurInstr.Instr >> 3) & 0xF); - u32 op = (CurrentInstr.Instr >> 8) & 0x3; + u32 op = (CurInstr.Instr >> 8) & 0x3; Comp_AddCycles_C(); switch (op) { - case 0x0: // ADD - Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); - return; - case 0x1: // CMP - Comp_CmpOp(2, rd, rs, false); - return; - case 0x2: // MOV - if (rd != rs) - MOV(32, rd, rs); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; + case 0x0: // ADD + Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + break; + case 0x1: // CMP + Comp_CmpOp(2, rdMapped, rs, false); + return; // this is on purpose + case 0x2: // MOV + if (rdMapped != rs) + MOV(32, rdMapped, rs); + TEST(32, rdMapped, rdMapped); + Comp_RetriveFlags(false, false, false); + break; + } + + if (rd == 15) + { + OR(32, rdMapped, Imm8(1)); + Comp_JumpTo(rdMapped.GetSimpleReg()); } } +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + OpArg sp = MapReg(13); + OpArg offset = Imm32((CurInstr.Instr & 0x7F) << 2); + if (CurInstr.Instr & (1 << 7)) + SUB(32, sp, offset); + else + ADD(32, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + OpArg sp = MapReg(13); + LEA(32, rd.GetSimpleReg(), MDisp(sp.GetSimpleReg(), offset)); + } + else + MOV(32, rd, Imm32((R15 & ~2) + offset)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 9096397..b7358a2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,7 +9,7 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = +const X64Reg RegCache::NativeRegAllocOrder[] = { #ifdef _WIN32 RBX, RSI, RDI, R12, R13 @@ -18,7 +18,7 @@ const X64Reg RegCache::NativeRegAllocOrder[] = #endif }; template <> -const int 
RegCache::NativeRegsAvailable = +const int RegCache::NativeRegsAvailable = #ifdef _WIN32 5 #else @@ -30,24 +30,33 @@ Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); - for (int i = 0; i < 15; i++) + for (int i = 0; i < 3; i++) { - ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); - WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); for (int j = 0; j < 2; j++) { - ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); - WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); + MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); + MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } - ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); - WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); - ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); - WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); ResetStart = GetWritableCodePtr(); } +DataRegion Compiler::ClassifyAddress(u32 addr) +{ + if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) + return dataRegionDTCM; + switch (addr & 0xFF000000) + { + case 0x02000000: return dataRegionMainRAM; + case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? 
dataRegionWRAM7 : dataRegionSWRAM; + case 0x04000000: return dataRegionIO; + case 0x06000000: return dataRegionVRAM; + } + return dataRegionGeneric; +} + void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -92,6 +101,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Num = cpu->Num; R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; + CurCPU = cpu; ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); @@ -106,27 +116,32 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; - CurrentInstr = instrs[i]; - - CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind); + CurInstr = instrs[i]; - if (CurrentInstr.Info.Branches()) - comp = NULL; + CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); if (i == instrsCount - 1) { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - SaveCPSR(); + if (comp == NULL || CurInstr.Info.Branches()) + SaveCPSR(); } + // run interpreter + cpu->CodeCycles = CurInstr.CodeCycles; + cpu->R[15] = R15; + cpu->CurInstr = CurInstr.Instr; + cpu->NextInstr[0] = CurInstr.NextInstr[0]; + cpu->NextInstr[1] = CurInstr.NextInstr[1]; + if (comp != NULL) RegCache.Prepare(i); else @@ -134,26 +149,33 @@ CompiledBlock 
Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } else (this->*comp)(); + + ARMInterpreter::THUMBInstrTable[icode](cpu); } else { - u32 cond = CurrentInstr.Cond(); - if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { MOV(64, R(ABI_PARAM1), R(RCPU)); ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + + ARMInterpreter::A_BLX_IMM(cpu); } else if (cond == 0xF) + { Comp_AddCycles_C(); + cpu->AddCycles_C(); + } else { FixupBranch skipExecute; @@ -180,18 +202,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } + u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); } else (this->*comp)(); FixupBranch skipFailed; - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) { skipFailed = J(); SetJumpTarget(skipExecute); @@ -200,13 +222,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } + + if (cpu->CheckCondition(cond)) + ARMInterpreter::ARMInstrTable[icode](cpu); + else + cpu->AddCycles_C(); } } /* we don't need to collect the interpreted cycles, - since all functions only add to it, the dispatcher - takes care of it. + since cpu->Cycles is taken into account by the dispatcher. 
*/ if (comp == NULL && i != instrsCount - 1) @@ -277,29 +303,29 @@ CompileFunc Compiler::GetCompFunc(int kind) // Mul NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRD NULL, NULL, NULL, NULL, // STRD NULL, NULL, NULL, NULL, // LDRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // swap - NULL, NULL, + NULL, NULL, // LDM/STM NULL, NULL, // Branch @@ -314,26 +340,26 @@ CompileFunc Compiler::GetCompFunc(int kind) // Three operand ADD/SUB T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + 
T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative - NULL, NULL, NULL, + T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, // LDR pcrel - NULL, + NULL, // LDR/STR reg offset - T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, - // LDR/STR sign extended, half - NULL, NULL, NULL, NULL, + T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, + // LDR/STR sign extended, half + T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset - NULL, NULL, + T_Comp_MemImmHalf, T_Comp_MemImmHalf, // branch, etc. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -346,10 +372,10 @@ CompileFunc Compiler::GetCompFunc(int kind) void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles); + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; @@ -358,13 +384,26 @@ void Compiler::Comp_AddCycles_C() void Compiler::Comp_AddCycles_CI(u32 i) { s32 cycles = (Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 0 : 2] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; - - if (CurrentInstr.Cond() < 0xE) + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; + + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; } +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + SaveCPSR(); + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 7ab9b25..9395a29 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,6 +6,8 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegCache.h" +#include + namespace ARMJIT { @@ -21,6 +23,19 @@ class Compiler; typedef void (Compiler::*CompileFunc)(); +enum DataRegion +{ + dataRegionGeneric, // hey, that's me! + dataRegionMainRAM, + dataRegionSWRAM, + dataRegionVRAM, + dataRegionIO, + dataRegionExclusive, + dataRegionsCount, + dataRegionDTCM = dataRegionExclusive, + dataRegionWRAM7 = dataRegionExclusive, +}; + class Compiler : public Gen::X64CodeBlock { public: @@ -34,6 +49,8 @@ public: private: CompileFunc GetCompFunc(int kind); + void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_AddCycles_C(); void Comp_AddCycles_CI(u32 i); @@ -47,11 +64,14 @@ private: opInvertOp2 = 1 << 5, }; + DataRegion ClassifyAddress(u32 addr); + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); void A_Comp_MemWB(); + void A_Comp_MemHalf(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -59,8 +79,15 @@ private: void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_RelAddr(); + void T_Comp_AddSP(); + void T_Comp_MemReg(); void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, 
Gen::OpArg op2, bool carryUsed, int opFlags); @@ -70,8 +97,8 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void* Gen_MemoryRoutine9(bool store, int size, u32 region); - void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + void* Gen_MemoryRoutine9(bool store, int size); + void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -92,10 +119,12 @@ private: } void* ResetStart; + void* MemoryFuncs9[3][2]; + void* MemoryFuncs7[3][2][2]; bool CPSRDirty = false; - FetchedInstr CurrentInstr; + FetchedInstr CurInstr; RegCache RegCache; @@ -105,12 +134,9 @@ private: u32 CodeRegion; u32 ConstantCycles; -}; -extern void* ReadMemFuncs9[16]; -extern void* ReadMemFuncs7[2][16]; -extern void* WriteMemFuncs9[16]; -extern void* WriteMemFuncs7[2][16]; + ARM* CurCPU; +}; } diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index d534269..69746e2 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -5,7 +5,6 @@ namespace NDS { -#define MAIN_RAM_SIZE 0x400000 extern u8* SWRAM_ARM9; extern u32 SWRAM_ARM9Mask; extern u8* SWRAM_ARM7; @@ -19,11 +18,6 @@ using namespace Gen; namespace ARMJIT { -void* ReadMemFuncs9[16]; -void* ReadMemFuncs7[2][16]; -void* WriteMemFuncs9[16]; -void* WriteMemFuncs7[2][16]; - template int squeezePointer(T* ptr) { @@ -32,569 +26,434 @@ int squeezePointer(T* ptr) return truncated; } -u32 ReadVRAM9(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} +/* + According to DeSmuME and my own 
research, approx. 99% (seriously, that's an empirical number) + of all memory load and store instructions always access addresses in the same region as + during the their first execution. -void WriteVRAM9(u32 addr, u32 val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} + I tried multiple optimisations, which would benefit from this behaviour + (having fast paths for the first region, …), though none of them yielded a measureable + improvement. +*/ /* - R11 - data to write (store only) - RSCRATCH2 - address - RSCRATCH3 - code cycles + address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) + store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) + code cycles - ABI_PARAM3 */ -void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +void* Compiler::Gen_MemoryRoutine9(bool store, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); AlignCode4(); - void* res = (void*)GetWritableCodePtr(); + void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); - // cycle counting! 
- // this is AddCycles_CDI - MOV(32, R(R10), R(RSCRATCH2)); - SHR(32, R(R10), Imm8(12)); - MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); - CMP(32, R(R10), R(RSCRATCH3)); - CMOVcc(32, RSCRATCH3, R(R10), CC_G); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); - ADD(32, R(RCycles), R(RSCRATCH3)); - - if (!store) - XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + // cycle counting! + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 0))); + LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + CMP(32, R(ABI_PARAM4), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + + if (store) { - MOV(32, R(RSCRATCH3), R(RSCRATCH2)); - SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch outsideDTCM = J_CC(CC_AE); - AND(32, R(RSCRATCH2), Imm32(0x3FFF)); - if (!store) + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) { - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + case 32: JMP((u8*)NDS::ARM9Write32, true); break; + case 16: JMP((u8*)NDS::ARM9Write16, true); break; + case 8: JMP((u8*)NDS::ARM9Write8, true); break; } - else - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); - RET(); - SetJumpTarget(outsideDTCM); - MOV(32, R(RSCRATCH2), R(RSCRATCH3)); } - - switch (region) + else { - case 0x00000000: - case 0x01000000: - { 
- CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - RET(); - SetJumpTarget(insideITCM); - AND(32, R(RSCRATCH2), Imm32(0x7FFF)); - if (!store) - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); - else - { - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); - } - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); - TEST(64, R(RSCRATCH3), R(RSCRATCH3)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); - if (!store) - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); - else - { - MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - SetJumpTarget(notMapped); - } - break; - case 0x04000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9IORead32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9IOWrite32, true); - } - break; - case 0x05000000: - { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - 
TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); - RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); - } - break; - case 0x06000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(ReadVRAM9); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)WriteVRAM9, true); - } - break; - case 0x07000000: + if (size == 32) { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + // everything's already in the appropriate register + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - case 0xFF000000: - if (!store) - { - AND(32, R(RSCRATCH2), Imm32(0xFFF)); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); - } - break; - default: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) + else if (size == 16) { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); + AND(32, R(ABI_PARAM1), 
Imm32(addressMask)); + JMP((u8*)NDS::ARM9Read16, true); } else + JMP((u8*)NDS::ARM9Read8, true); + } + + SetJumpTarget(insideDTCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + if (store) + MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + if (size == 32) { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9Write32, true); + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); } - break; } + RET(); - if (!store) + SetJumpTarget(insideITCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX + AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); + if (store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } } - RET(); + static_assert(RSCRATCH == EAX); + return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
1 : 0)); AlignCode4(); void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } - - // AddCycles_CDI - MOV(32, R(RSCRATCH), R(RSCRATCH2)); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); - if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + if (codeMainRAM) { - if (!store && region != 0x02000000) - LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); - ADD(32, R(RCycles), R(RSCRATCH3)); + LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); + ADD(32, R(RCycles), R(RSCRATCH)); } else { if (!store) - ADD(32, R(region == 0x02000000 ? 
RSCRATCH2 : RSCRATCH), Imm8(1)); - LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); - CMP(32, R(RSCRATCH3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); - CMP(32, R(R10), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(ABI_PARAM3), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); ADD(32, R(RCycles), R(RSCRATCH)); } - - if (!store) + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); + AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); + if (store) + { + MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + } + RET(); - switch (region) + SetJumpTarget(outsideMainRAM); + if (codeMainRAM) + { + if (!store) + ADD(32, R(ABI_PARAM4), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + else + { + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 
0 : 1)); + ADD(32, R(RCycles), R(RSCRATCH)); + } + if (store) + { + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) + { + case 32: JMP((u8*)NDS::ARM7Write32, true); break; + case 16: JMP((u8*)NDS::ARM7Write16, true); break; + case 8: JMP((u8*)NDS::ARM7Write8, true); break; + } + } + else { - case 0x00000000: - if (!store) { - CMP(32, R(RSCRATCH2), Imm32(0x4000)); - FixupBranch outsideBIOS1 = J_CC(CC_AE); - - MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); - CMP(32, R(RSCRATCH), Imm32(0x4000)); - FixupBranch outsideBIOS2 = J_CC(CC_AE); - MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); - CMP(32, R(RSCRATCH2), R(RSCRATCH3)); - FixupBranch notDenied1 = J_CC(CC_AE); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - FixupBranch notDenied2 = J_CC(CC_B); - SetJumpTarget(outsideBIOS2); - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - RET(); - - SetJumpTarget(notDenied1); - SetJumpTarget(notDenied2); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - - SetJumpTarget(outsideBIOS1); - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); - if (!store) - { - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - else - { - MOV(32, MRegSum(RSCRATCH, RSCRATCH2), 
R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - RET(); - SetJumpTarget(region); - SetJumpTarget(notMapped); - AND(32, R(RSCRATCH2), Imm32(0xFFFF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); - } - } - break; - case 0x04000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(NDS::ARM7IORead32); - ABI_PopRegistersAndAdjustStack({}, 8); - - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM7IOWrite32, true); - } - SetJumpTarget(region); - - if (!store) - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); - - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({EAX}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - MOV(32, R(RSCRATCH2), R(EAX)); - SHL(32, R(RSCRATCH2), Imm8(16)); - ABI_PopRegistersAndAdjustStack({EAX}, 8); - OR(32, R(EAX), R(RSCRATCH2)); - } - else - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - SHR(32, R(R11), Imm8(16)); - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - 
MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - } - } - break; - case 0x06000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(GPU::ReadVRAM_ARM7); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)GPU::WriteVRAM_ARM7, true); - } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - /*default: - ABI_PushRegistersAndAdjustStack({}, 8, 0); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (size == 32) + { + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - break;*/ + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else if (size == 16) + { + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM7Read16, true); + } + else + JMP((u8*)NDS::ARM7Read8, true); } + return res; +} + +void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +{ + if (store) + MOV(32, R(ABI_PARAM2), rd); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM3), Imm32(cycles)); + CALL(Num == 0 + ? 
MemoryFuncs9[size >> 4][store] + : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + if (!store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + if (signExtend) + MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); } - - RET(); - - return res; } OpArg Compiler::A_Comp_GetMemWBOffset() { - if (!(CurrentInstr.Instr & (1 << 25))) - return Imm32(CurrentInstr.Instr & 0xFFF); + if (!(CurInstr.Instr & (1 << 25))) + { + u32 imm = CurInstr.Instr & 0xFFF; + return Imm32(imm); + } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - int amount = (CurrentInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurInstr.A_Reg(0)); bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); } } void Compiler::A_Comp_MemWB() -{ - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); - bool load = CurrentInstr.Instr & (1 << 20); +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + int size = byte ? 8 : 32; - MOV(32, R(RSCRATCH2), rn); - if (CurrentInstr.Instr & (1 << 24)) + if (CurInstr.Instr & (1 << 24)) { OpArg offset = A_Comp_GetMemWBOffset(); - if (CurrentInstr.Instr & (1 << 23)) - ADD(32, R(RSCRATCH2), offset); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); else - SUB(32, R(RSCRATCH2), offset); + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } - if (CurrentInstr.Instr & (1 << 21)) - MOV(32, rn, R(RSCRATCH2)); + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - - u32 cycles = Num ? 
NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; else + MOV(32, R(ABI_PARAM1), rn); + + if (!(CurInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + if (load && CurInstr.A_Reg(12) == 15) + { + if (byte) + printf("!!! LDRB PC %08X\n", R15); + else + { + if (Num == 1) + AND(32, rd, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rd.GetSimpleReg()); + } + } +} + +void Compiler::A_Comp_MemHalf() +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + OpArg offset = CurInstr.Instr & (1 << 22) + ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : MapReg(CurInstr.A_Reg(0)); + + if (CurInstr.Instr & (1 << 24)) { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); + else + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } + + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + else + MOV(32, R(ABI_PARAM1), rn); - if (load) - MOV(32, R(RSCRATCH2), R(RSCRATCH)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); - if (!(CurrentInstr.Instr & (1 << 24))) + bool signExtend = false; + int size; + if (!load && op == 1) + size = 16; + else if (load) { - OpArg offset = A_Comp_GetMemWBOffset(); + size = op == 2 ? 
8 : 16; + signExtend = op > 1; + } - if (CurrentInstr.Instr & (1 << 23)) + if (!(CurInstr.Instr & (1 << 24))) + { + if (CurInstr.Instr & (1 << 23)) ADD(32, rn, offset); else SUB(32, rn, offset); } - if (load) - MOV(32, rd, R(RSCRATCH2)); + Comp_MemAccess(rd, signExtend, !load, size); + + if (load && CurInstr.A_Reg(12) == 15) + printf("!!! MemHalf op PC %08X\n", R15);; } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurrentInstr.Instr >> 10) & 0x3; + int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; - - MOV(32, R(RSCRATCH2), rb); - ADD(32, R(RSCRATCH2), ro); - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + bool byte = op & 0x1; + + MOV_sum(32, ABI_PARAM1, rb, ro); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } void Compiler::T_Comp_MemImm() { - // TODO: aufräumen!!! - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - - int op = (CurrentInstr.Instr >> 11) & 0x3; - u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 
1 : 4); - LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); +} + +void Compiler::T_Comp_MemRegHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); + + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + MOV_sum(32, ABI_PARAM1, rb, ro); + + Comp_MemAccess(rd, signExtend, !load, size); +} + +void Compiler::T_Comp_MemImmHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, 16); } } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 41c46e1..32a9645 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -317,7 +317,7 @@ Info Decode(bool thumb, u32 num, u32 instr) else { u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; - if ((instr & 0xFE000000) == 0xFA000000) + if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; if (data & A_ARM9Only && num != 0) diff --git a/src/NDS.cpp b/src/NDS.cpp index b8fd8cb..baa5e0d 100644 --- a/src/NDS.cpp 
+++ b/src/NDS.cpp @@ -524,6 +524,8 @@ void Reset() KeyCnt = 0; RCnt = 0; + ARMJIT::ResetBlocks(); + NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); -- cgit v1.2.3 From ff9721111441e69b4a276a34c757476b625213c6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 10 Jul 2019 00:57:59 +0200 Subject: jit: thumb block transfer working also pc and sp relative loads and some refactoring --- src/ARMJIT_RegCache.h | 136 ---------- src/ARMJIT_RegisterCache.h | 136 ++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 82 ++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 515 +++++++++++++++++++++++++++++++----- src/ARM_InstrInfo.cpp | 46 ++-- 6 files changed, 682 insertions(+), 252 deletions(-) delete mode 100644 src/ARMJIT_RegCache.h create mode 100644 src/ARMJIT_RegisterCache.h (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h deleted file mode 100644 index 556d27b..0000000 --- a/src/ARMJIT_RegCache.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef ARMJIT_REGCACHE_H -#define ARMJIT_REGCACHE_H - -#include "ARMJIT.h" - -// TODO: replace this in the future -#include "dolphin/BitSet.h" - -#include - -namespace ARMJIT -{ - -template -class RegCache -{ -public: - RegCache() - {} - - RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) - : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) - { - for (int i = 0; i < 16; i++) - Mapping[i] = (Reg)-1; - } - - void UnloadRegister(int reg) - { - assert(Mapping[reg] != -1); - - if (DirtyRegs & (1 << reg)) - Compiler->SaveReg(reg, Mapping[reg]); - - DirtyRegs &= ~(1 << reg); - LoadedRegs &= ~(1 << reg); - NativeRegsUsed &= ~(1 << (int)Mapping[reg]); - Mapping[reg] = (Reg)-1; - } - - void LoadRegister(int reg) - { - assert(Mapping[reg] == -1); - for (int i = 0; i < NativeRegsAvailable; i++) - { - Reg nativeReg = NativeRegAllocOrder[i]; - if (!(NativeRegsUsed & (1 << nativeReg))) - { - Mapping[reg] = nativeReg; - NativeRegsUsed |= 1 << (int)nativeReg; - LoadedRegs 
|= 1 << reg; - - Compiler->LoadReg(reg, nativeReg); - - return; - } - } - - assert("Welp!"); - } - - void Flush() - { - BitSet16 loadedSet(LoadedRegs); - for (int reg : loadedSet) - UnloadRegister(reg); - } - - void Prepare(int i) - { - u16 futureNeeded = 0; - int ranking[16]; - for (int j = 0; j < 16; j++) - ranking[j] = 0; - for (int j = i; j < InstrsCount; j++) - { - BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); - futureNeeded |= regsNeeded.m_val; - for (int reg : regsNeeded) - ranking[reg]++; - } - - // we'll unload all registers which are never used again - BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); - for (int reg : neverNeededAgain) - UnloadRegister(reg); - - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; - BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); - if (needToBeLoaded != BitSet16(0)) - { - int neededCount = needToBeLoaded.Count(); - BitSet16 loadedSet(LoadedRegs); - while (loadedSet.Count() + neededCount > NativeRegsAvailable) - { - int leastReg = -1; - int rank = 1000; - for (int reg : loadedSet) - { - if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) - { - leastReg = reg; - rank = ranking[reg]; - } - } - - assert(leastReg != -1); - UnloadRegister(leastReg); - - loadedSet.m_val = LoadedRegs; - } - - for (int reg : needToBeLoaded) - LoadRegister(reg); - } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); - } - - static const Reg NativeRegAllocOrder[]; - static const int NativeRegsAvailable; - - Reg Mapping[16]; - u32 NativeRegsUsed = 0; - u16 LoadedRegs = 0; - u16 DirtyRegs = 0; - - T* Compiler; - - FetchedInstr* Instrs; - int InstrsCount; -}; - -} - -#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h new file mode 100644 index 0000000..04c1eda --- /dev/null +++ b/src/ARMJIT_RegisterCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include 
"ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include + +namespace ARMJIT +{ + +template +class RegisterCache +{ +public: + RegisterCache() + {} + + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->SaveReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > 
NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b7358a2..4fe0c70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,20 +9,20 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = +const X64Reg RegisterCache::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13 + RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13 + RBX, R12, R13, R14 // this is sad #endif }; template <> -const int RegCache::NativeRegsAvailable = +const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 - 5 + 6 #else - 3 + 4 #endif ; @@ -39,10 +39,47 @@ Compiler::Compiler() MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) + { + MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); + MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); + MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); + } ResetStart = GetWritableCodePtr(); } +void* Compiler::Gen_ChangeCPSRRoutine() +{ + void* res = (void*)GetWritableCodePtr(); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + CMP(32, 
R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + + SetJumpTarget(fiq); + + SetJumpTarget(irq); + + SetJumpTarget(svc); + + SetJumpTarget(abt); + + SetJumpTarget(und); + + return res; +} + DataRegion Compiler::ClassifyAddress(u32 addr) { if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) @@ -106,12 +143,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); - XOR(32, R(RCycles), R(RCycles)); LoadCPSR(); // TODO: this is ugly as a whole, do better - RegCache = ARMJIT::RegCache(this, instrs, instrsCount); + RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) { @@ -242,7 +278,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs RegCache.Flush(); SaveCPSR(); - LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + MOV(32, R(RAX), Imm32(ConstantCycles)); ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); @@ -306,18 +342,20 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // STRB + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, 
A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD - NULL, NULL, NULL, NULL, - // STRD - NULL, NULL, NULL, NULL, + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // LDRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB @@ -360,10 +398,14 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // branch, etc. - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL + // LDR/STR sp rel + NULL, NULL, + // PUSH/POP + NULL, NULL, + // LDMIA, STMIA + NULL, NULL, + NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL }; return Thumb ? T_Comp[kind] : A_Comp[kind]; @@ -376,7 +418,7 @@ void Compiler::Comp_AddCycles_C() : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -388,13 +430,15 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { + // potentieller Bug: falls ein Register das noch gecacht ist, beim Modeswitch gespeichert + // wird der alte Wert gespeichert SaveCPSR(); MOV(64, R(ABI_PARAM1), R(RCPU)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9395a29..a751737 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,7 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" -#include "../ARMJIT_RegCache.h" +#include "../ARMJIT_RegisterCache.h" #include @@ -12,7 +12,6 @@ namespace ARMJIT { const Gen::X64Reg RCPU = Gen::RBP; -const Gen::X64Reg RCycles = Gen::R14; const Gen::X64Reg RCPSR = Gen::R15; const Gen::X64Reg RSCRATCH = Gen::EAX; @@ -72,6 +71,7 @@ private: void A_Comp_MemWB(); void A_Comp_MemHalf(); + void A_Comp_LDM_STM(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -86,8 +86,13 @@ private: void T_Comp_MemImm(); void T_Comp_MemRegHalf(); void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + void T_Comp_PUSH_POP(); + void T_Comp_LDMIA_STMIA(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -100,6 +105,11 @@ private: void* Gen_MemoryRoutine9(bool store, int size); void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); + void* Gen_MemoryRoutineSeq9(bool store, bool preinc); + void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); + + void* Gen_ChangeCPSRRoutine(); + Gen::OpArg 
Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -122,11 +132,14 @@ private: void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; + void* MemoryFuncsSeq9[2][2]; + void* MemoryFuncsSeq7[2][2][2]; + bool CPSRDirty = false; FetchedInstr CurInstr; - RegCache RegCache; + RegisterCache RegCache; bool Thumb; u32 Num; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69746e2..20e1893 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -3,16 +3,6 @@ #include "../GPU.h" #include "../Wifi.h" -namespace NDS -{ -extern u8* SWRAM_ARM9; -extern u32 SWRAM_ARM9Mask; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM7Mask; -extern u8 ARM7WRAM[]; -extern u16 ARM7BIOSProt; -} - using namespace Gen; namespace ARMJIT @@ -41,6 +31,49 @@ int squeezePointer(T* ptr) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) code cycles - ABI_PARAM3 */ + +#define CALC_CYCLES_9(numC, numD, scratch) \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ + CMP(32, R(numC), R(numD)); \ + CMOVcc(32, numD, R(numC), CC_G); \ + CMP(32, R(numD), R(scratch)); \ + CMOVcc(32, scratch, R(numD), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); +#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + LEA(32, scratch, MRegSum(numD, numC)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + if (!store) \ + ADD(32, R(numC), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } +#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + if (!store) \ + ADD(32, 
R(numD), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } + void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -56,15 +89,10 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) FixupBranch insideITCM = J_CC(CC_B); // cycle counting! - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 0))); - LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - CMP(32, R(ABI_PARAM4), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); + MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); + SHR(32, R(ABI_PARAM4), Imm8(12)); + MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); + CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { @@ -101,7 +129,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -120,7 +148,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) @@ -158,28 +186,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); AND(32, R(RSCRATCH), Imm32(0xFF000000)); CMP(32, R(RSCRATCH), Imm32(0x02000000)); FixupBranch outsideMainRAM = J_CC(CC_NE); - if (codeMainRAM) - { - LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - if (!store) - ADD(32, R(ABI_PARAM3), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); if (store) @@ -205,22 +218,7 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) RET(); SetJumpTarget(outsideMainRAM); - if (codeMainRAM) - { - if (!store) - ADD(32, R(ABI_PARAM4), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 
0 : 1)); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { if (size > 8) @@ -257,7 +255,189 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) return res; } -void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +#define MEMORY_SEQ_WHILE_COND \ + if (!store) \ + MOV(32, currentElement, R(EAX));\ + if (!preinc) \ + ADD(32, R(ABI_PARAM1), Imm8(4)); \ + \ + SUB(32, R(ABI_PARAM3), Imm8(1)); \ + J_CC(CC_NZ, repeat); + +/* + ABI_PARAM1 address + ABI_PARAM2 address where registers are stored + ABI_PARAM3 how many values to read/write + ABI_PARAM4 code cycles + + Dolphin x64CodeEmitter is my favourite assembler + */ +void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); + + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM9Write32); + } + else + CALL((void*)NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + 
SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); + MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); + + FixupBranch finishIt1 = J(); + + SetJumpTarget(insideDTCM); + AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time + MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential + FixupBranch finishIt2 = J(); + + SetJumpTarget(insideITCM); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); + XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); + MOV(32, R(ABI_PARAM2), Imm32(1)); + + SetJumpTarget(finishIt1); + SetJumpTarget(finishIt2); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res 
= (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM7Write32); + } + else + CALL((void*)NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + SetJumpTarget(outsideMainRAM); + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +#undef CALC_CYCLES_9 +#undef MEMORY_SEQ_WHILE_COND + +void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) { if (store) MOV(32, R(ABI_PARAM2), rd); @@ -278,6 +458,129 @@ void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int si } } +s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + int regsCount = regs.Count(); 
+ + const u8 userModeOffsets[] = + { + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, + + offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), + offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, + }; + + if (decrement) + { + MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + preinc = !preinc; + } + else + MOV(32, R(ABI_PARAM1), rb); + + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); + if (!store) + { + SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? 
MemoryFuncsSeq9[0][preinc] + : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + + for (int reg = 15; reg >= 0; reg--) + { + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH2), R(RCPSR)); + AND(32, R(RSCRATCH2), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + POP(RSCRATCH); + MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); + } + else + { + if (reg != 15) + RegCache.DirtyRegs |= (1 << reg); + POP(MapReg(reg).GetSimpleReg()); + } + } + } + + if (regs[15]) + { + if (Num == 1) + OR(32, MapReg(15), Imm8(1)); + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + } + else + { + for (int reg : regs) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); + PUSH(RSCRATCH); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + PUSH(MapReg(reg).GetSimpleReg()); + } + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? MemoryFuncsSeq9[1][preinc] + : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + + ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + } + + return (regsCount * 4) * (decrement ? -1 : 1); +} + OpArg Compiler::A_Comp_GetMemWBOffset() { if (!(CurInstr.Instr & (1 << 25))) @@ -354,6 +657,25 @@ void Compiler::A_Comp_MemHalf() ? 
Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) : MapReg(CurInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); + + bool signExtend = false; + int size; + if (!load) + { + size = op == 1 ? 16 : 32; + load = op == 2; + } + else if (load) + { + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } + + if (size == 32 && Num == 1) + return; // NOP + if (CurInstr.Instr & (1 << 24)) { if (CurInstr.Instr & (1 << 23)) @@ -370,19 +692,6 @@ void Compiler::A_Comp_MemHalf() else MOV(32, R(ABI_PARAM1), rn); - int op = (CurInstr.Instr >> 5) & 0x3; - bool load = CurInstr.Instr & (1 << 20); - - bool signExtend = false; - int size; - if (!load && op == 1) - size = 16; - else if (load) - { - size = op == 2 ? 8 : 16; - signExtend = op > 1; - } - if (!(CurInstr.Instr & (1 << 24))) { if (CurInstr.Instr & (1 << 23)) @@ -412,6 +721,24 @@ void Compiler::T_Comp_MemReg() Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = (CurInstr.Instr >> 20) & 1; + bool pre = (CurInstr.Instr >> 24) & 1; + bool add = (CurInstr.Instr >> 23) & 1; + bool writeback = (CurInstr.Instr >> 21) & 1; + bool usermode = (CurInstr.Instr >> 22) & 1; + + OpArg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + + if (writeback) + ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? 
Imm8(offset) : Imm32(offset)); +} + void Compiler::T_Comp_MemImm() { OpArg rd = MapReg(CurInstr.T_Reg(0)); @@ -456,4 +783,56 @@ void Compiler::T_Comp_MemImmHalf() Comp_MemAccess(rd, false, !load, 16); } +void Compiler::T_Comp_LoadPCRel() +{ + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + // hopefully this doesn't break + u32 val; CurCPU->DataRead32(addr, &val); + MOV(32, rd, Imm32(val)); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, 32); +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + OpArg sp = MapReg(13); + + s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + + ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + OpArg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + ADD(32, rb, Imm8(offset)); +} + } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 32a9645..c519229 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -25,9 +25,7 @@ enum { A_Link = 1 << 10, - A_LDMSTM = 1 << 11, - - A_ARM9Only = 1 << 12, + A_UnkOnARM7 = 1 << 11, }; #define A_BIOP A_Read16 @@ -97,12 +95,12 @@ const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); const u32 A_SMULxy = 
A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); -const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ); +const u32 A_CLZ = A_Write12 | A_Read0 | A_UnkOnARM7 | ak(ak_CLZ); -const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD); -const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB); -const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD); -const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB); +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 #define A_STR A_Read12 @@ -144,8 +142,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM); +const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -154,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC); +const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); +const u32 A_MRS = 
A_Write12 | A_UnkOnARM7 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -249,7 +247,7 @@ const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); @@ -320,8 +318,10 @@ Info Decode(bool thumb, u32 num, u32 instr) if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; - if (data & A_ARM9Only && num != 0) - data |= A_BranchAlways | A_Link; + if (data & A_UnkOnARM7 && num != 0) + data = A_UNK; + + res.Kind = (data >> 13) & 0x1FF; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); @@ -360,14 +360,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= 1 << 15; } - if (data & A_LDMSTM) - { - res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15); - if (instr & (1 << 21)) - res.DstRegs |= 1 << ((instr >> 16) & 0xF); - } - - res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_LDM) + res.DstRegs |= instr & (1 << 15); // this is right return res; } -- cgit v1.2.3 From c58fdbd66bab9f1b97e9522afa5436f212540b6d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 11 Jul 2019 16:22:47 +0200 Subject: jit: branch instructions --- src/ARM.cpp | 12 +- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 267 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 185 ++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 30 ++-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 42 +----- src/ARM_InstrInfo.cpp | 6 +- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 1 + 10 files changed, 363 insertions(+), 187 
deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_Branch.cpp (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index f7ca26d..aca876d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -521,11 +521,8 @@ void ARMv5::Execute() printf("aaarg ungempappter raum %x\n", R[15]);*/ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - if (block == NULL) - ARMJIT::CompileBlock(this); - else - Cycles += block(); - + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + // TODO optimize this shit!!! if (Halted) { @@ -607,10 +604,7 @@ void ARMv4::Execute() printf("aaarg ungempappter raum %x\n", R[15]);*/ ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - if (block == NULL) - ARMJIT::CompileBlock(this); - else - Cycles += block(); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); // TODO optimize this shit!!! if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 6afa967..47b425f 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,7 +121,7 @@ void DeInit() delete compiler; } -void CompileBlock(ARM* cpu) +CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -171,6 +171,8 @@ void CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); InsertBlock(cpu->Num, r15Initial - (thumb ? 
2 : 4), block); + + return block; } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 71188f9..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -void CompileBlock(ARM* cpu); +CompiledBlock CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp new file mode 100644 index 0000000..fb2acba --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -0,0 +1,267 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) +{ + // we can simplify constant branches by a lot + // it's not completely safe to assume stuff like, which instructions to preload + // we'll see how it works out + + u32 newPC; + u32 nextInstr[2]; + u32 cycles = 0; + bool setupRegion = false; + + if (addr & 0x1 && !Thumb) + { + CPSRDirty = true; + OR(32, R(RCPSR), Imm8(0x20)); + } + else if (!(addr & 0x1) && Thumb) + { + CPSRDirty = true; + AND(32, R(RCPSR), Imm32(~0x20)); + } + + if (Num == 0) + { + ARMv5* cpu9 = (ARMv5*)CurCPU; + + u32 oldregion = R15 >> 24; + u32 newregion = addr >> 24; + + u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + cpu9->RegionCodeCycles = regionCodeCycles; + + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); + + setupRegion = newregion != oldregion; + if (setupRegion) + cpu9->SetupCodeMem(addr); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + // two-opcodes-at-once fetch + // doesn't matter if we put garbage in the MSbs there + if (addr & 0x2) + { + nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; + cycles += CurCPU->CodeCycles; + nextInstr[1] = cpu9->CodeRead32(addr+2, false); + cycles += CurCPU->CodeCycles; + } + else + { + nextInstr[0] = cpu9->CodeRead32(addr, true); + nextInstr[1] = nextInstr[0] >> 16; + cycles += 
CurCPU->CodeCycles; + } + } + else + { + addr &= ~0x3; + newPC = addr+4; + + nextInstr[0] = cpu9->CodeRead32(addr, true); + cycles += cpu9->CodeCycles; + nextInstr[1] = cpu9->CodeRead32(addr+4, false); + cycles += cpu9->CodeCycles; + } + } + else + { + ARMv4* cpu7 = (ARMv4*)CurCPU; + + u32 codeRegion = addr >> 24; + u32 codeCycles = addr >> 15; // cheato + + cpu7->CodeRegion = codeRegion; + cpu7->CodeCycles = codeCycles; + + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeCycles)); + + if (addr & 0x1) + { + addr &= ~0x1; + newPC = addr+2; + + nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); + nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); + cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + } + else + { + addr &= ~0x3; + newPC = addr+4; + + nextInstr[0] = cpu7->CodeRead32(addr); + nextInstr[1] = cpu7->CodeRead32(addr+4); + cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + } + } + + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(nextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(nextInstr[1])); + if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) + ConstantCycles += cycles; + else + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + + if (setupRegion) + { + MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), Imm32(newPC)); + CALL((void*)&ARMv5::SetupCodeMem); + } +} + +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + bool previouslyDirty = CPSRDirty; + SaveCPSR(); + + if (restoreCPSR) + { + if (Thumb || CurInstr.Cond() >= 0xE) + { + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + // the ugly way... 
+ // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + } + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + if (!restoreCPSR) + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + else + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); + + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + + if (previouslyDirty) + LoadCPSR(); + CPSRDirty = previouslyDirty; +} + +void Compiler::A_Comp_BranchImm() +{ + int op = (CurInstr.Instr >> 24) & 1; + s32 offset = (s32)(CurInstr.Instr << 8) >> 6; + u32 target = R15 + offset; + bool link = op; + + if (CurInstr.Cond() == 0xF) // BLX_imm + { + target += (op << 1) + 1; + link = true; + } + + if (link) + MOV(32, MapReg(14), Imm32(R15 - 4)); + + Comp_JumpTo(target); +} + +void Compiler::A_Comp_BranchXchangeReg() +{ + OpArg rn = MapReg(CurInstr.A_Reg(0)); + if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg + MOV(32, MapReg(14), Imm32(R15 - 4)); + Comp_JumpTo(rn.GetSimpleReg()); +} + +void Compiler::T_Comp_BCOND() +{ + u32 cond = (CurInstr.Instr >> 8) & 0xF; + FixupBranch skipExecute = CheckCondition(cond); + + s32 offset = (s32)(CurInstr.Instr << 24) >> 23; + Comp_JumpTo(R15 + offset + 1, true); + + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); + Comp_AddCycles_C(true); + SetJumpTarget(skipFailed); +} + +void Compiler::T_Comp_B() +{ + s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20; + Comp_JumpTo(R15 + offset + 1); +} + +void Compiler::T_Comp_BranchXchangeReg() +{ + bool link = CurInstr.Instr & (1 << 7); + if (link && Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + + OpArg rn = MapReg(CurInstr.A_Reg(3)); + if (link) + MOV(32, MapReg(14), Imm32(R15 - 1)); + Comp_JumpTo(rn.GetSimpleReg()); +} + +void Compiler::T_Comp_BL_LONG_1() +{ + s32 offset = 
(s32)((CurInstr.Instr & 0x7FF) << 21) >> 9; + MOV(32, MapReg(14), Imm32(R15 + offset)); + Comp_AddCycles_C(); +} + +void Compiler::T_Comp_BL_LONG_2() +{ + OpArg lr = MapReg(14); + s32 offset = (CurInstr.Instr & 0x7FF) << 1; + LEA(32, RSCRATCH, MDisp(lr.GetSimpleReg(), offset)); + MOV(32, lr, Imm32((R15 - 2) | 1)); + if (Num == 1 || CurInstr.Instr & (1 << 12)) + OR(32, R(RSCRATCH), Imm8(1)); + Comp_JumpTo(RSCRATCH); +} + +void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +{ + assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); + Comp_AddCycles_C(); + + u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); + target += (CurInstr.Instr & 0x7FF) << 1; + + if (Num == 1 || CurInstr.Instr & (1 << 12)) + target |= 1; + + MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); + + Comp_JumpTo(target); +} + +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 4fe0c70..6799a90 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -50,50 +50,6 @@ Compiler::Compiler() ResetStart = GetWritableCodePtr(); } -void* Compiler::Gen_ChangeCPSRRoutine() -{ - void* res = (void*)GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - CMP(32, R(RSCRATCH), Imm8(0x11)); - FixupBranch fiq = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x12)); - FixupBranch irq = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x13)); - FixupBranch svc = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x17)); - FixupBranch abt = J_CC(CC_E); - CMP(32, R(RSCRATCH), Imm8(0x1B)); - FixupBranch und = J_CC(CC_E); - - SetJumpTarget(fiq); - - SetJumpTarget(irq); - - SetJumpTarget(svc); - - SetJumpTarget(abt); - - SetJumpTarget(und); - - return res; -} - -DataRegion Compiler::ClassifyAddress(u32 addr) -{ - if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) - return dataRegionDTCM; - switch (addr & 0xFF000000) - { - case 0x02000000: return 
dataRegionMainRAM; - case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? dataRegionWRAM7 : dataRegionSWRAM; - case 0x04000000: return dataRegionIO; - case 0x06000000: return dataRegionVRAM; - } - return dataRegionGeneric; -} - void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -123,6 +79,29 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } +// invalidates RSCRATCH and RSCRATCH3 +Gen::FixupBranch Compiler::CheckCondition(u32 cond) +{ + if (cond >= 0x8) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(RSCRATCH3), R(RCPSR)); + SHR(32, R(RSCRATCH3), Imm8(28)); + MOV(32, R(RSCRATCH), Imm32(1)); + SHL(32, R(RSCRATCH), R(RSCRATCH3)); + TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); + + return J_CC(CC_Z); + } + else + { + // could have used a LUT, but then where would be the fun? + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); + + return J_CC(cond & 1 ? 
CC_NZ : CC_Z); + } +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -140,6 +119,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CodeRegion = cpu->CodeRegion; CurCPU = cpu; + bool mergedThumbBL = false; + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -167,17 +148,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - if (comp == NULL || CurInstr.Info.Branches()) + if (comp == NULL) SaveCPSR(); } - - // run interpreter - cpu->CodeCycles = CurInstr.CodeCycles; - cpu->R[15] = R15; - cpu->CurInstr = CurInstr.Instr; - cpu->NextInstr[0] = CurInstr.NextInstr[0]; - cpu->NextInstr[1] = CurInstr.NextInstr[1]; - + if (comp != NULL) RegCache.Prepare(i); else @@ -185,58 +159,44 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) + if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 + && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) + mergedThumbBL = true; + else { - MOV(64, R(ABI_PARAM1), R(RCPU)); + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) + { + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + } + else if (mergedThumbBL) + T_Comp_BL_Merged(instrs[i - 1]); + else + (this->*comp)(); } - else - (this->*comp)(); - - ARMInterpreter::THUMBInstrTable[icode](cpu); } else { u32 cond = CurInstr.Cond(); if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { - MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::A_BLX_IMM); - - ARMInterpreter::A_BLX_IMM(cpu); + if (comp) + (this->*comp)(); + else + { + MOV(64, R(ABI_PARAM1), 
R(RCPU)); + ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + } } else if (cond == 0xF) - { Comp_AddCycles_C(); - cpu->AddCycles_C(); - } else { FixupBranch skipExecute; if (cond < 0xE) - { - if (cond >= 0x8) - { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(RSCRATCH3), R(RCPSR)); - SHR(32, R(RSCRATCH3), Imm8(28)); - MOV(32, R(RSCRATCH), Imm32(1)); - SHL(32, R(RSCRATCH), R(RSCRATCH3)); - TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - - skipExecute = J_CC(CC_Z); - } - else - { - // could have used a LUT, but then where would be the fun? - TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - - skipExecute = J_CC(cond & 1 ? CC_NZ : CC_Z); - } - - } + skipExecute = CheckCondition(cond); u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) @@ -258,19 +218,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } - - if (cpu->CheckCondition(cond)) - ARMInterpreter::ARMInstrTable[icode](cpu); - else - cpu->AddCycles_C(); } } - /* - we don't need to collect the interpreted cycles, - since cpu->Cycles is taken into account by the dispatcher. 
- */ - if (comp == NULL && i != instrsCount - 1) LoadCPSR(); } @@ -367,7 +317,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // LDM/STM NULL, NULL, // Branch - NULL, NULL, NULL, NULL, NULL, + A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, // system stuff NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -389,7 +339,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // pc/sp relative T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, // LDR pcrel - NULL, + T_Comp_LoadPCRel, // LDR/STR reg offset T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, // LDR/STR sign extended, half @@ -399,25 +349,27 @@ CompileFunc Compiler::GetCompFunc(int kind) // LDR/STR half imm offset T_Comp_MemImmHalf, T_Comp_MemImmHalf, // LDR/STR sp rel - NULL, NULL, + T_Comp_MemSPRel, T_Comp_MemSPRel, // PUSH/POP - NULL, NULL, + T_Comp_PUSH_POP, T_Comp_PUSH_POP, // LDMIA, STMIA - NULL, NULL, - NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL + T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA, + // Branch + T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2, + // Unk, SVC + NULL, NULL }; return Thumb ? T_Comp[kind] : A_Comp[kind]; } -void Compiler::Comp_AddCycles_C() +void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (CurInstr.Cond() < 0xE) + if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; @@ -429,25 +381,10 @@ void Compiler::Comp_AddCycles_CI(u32 i) NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + i; - if (CurInstr.Cond() < 0xE) + if (!Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } -void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) -{ - // potentieller Bug: falls ein Register das noch gecacht ist, beim Modeswitch gespeichert - // wird der alte Wert gespeichert - SaveCPSR(); - - MOV(64, R(ABI_PARAM1), R(RCPU)); - MOV(32, R(ABI_PARAM2), R(addr)); - MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); - if (Num == 0) - CALL((void*)&ARMv5::JumpTo); - else - CALL((void*)&ARMv4::JumpTo); -} - } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a751737..45b488a 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,19 +22,6 @@ class Compiler; typedef void (Compiler::*CompileFunc)(); -enum DataRegion -{ - dataRegionGeneric, // hey, that's me! - dataRegionMainRAM, - dataRegionSWRAM, - dataRegionVRAM, - dataRegionIO, - dataRegionExclusive, - dataRegionsCount, - dataRegionDTCM = dataRegionExclusive, - dataRegionWRAM7 = dataRegionExclusive, -}; - class Compiler : public Gen::X64CodeBlock { public: @@ -49,8 +36,9 @@ private: CompileFunc GetCompFunc(int kind); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); - void Comp_AddCycles_C(); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); enum @@ -63,8 +51,6 @@ private: opInvertOp2 = 1 << 5, }; - DataRegion ClassifyAddress(u32 addr); - void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); @@ -73,6 +59,9 @@ private: void A_Comp_MemHalf(); void A_Comp_LDM_STM(); + void A_Comp_BranchImm(); + void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -91,6 +80,13 @@ private: void T_Comp_PUSH_POP(); void T_Comp_LDMIA_STMIA(); + void T_Comp_BCOND(); + 
void T_Comp_B(); + void T_Comp_BranchXchangeReg(); + void T_Comp_BL_LONG_1(); + void T_Comp_BL_LONG_2(); + void T_Comp_BL_Merged(FetchedInstr prefix); + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); @@ -119,6 +115,8 @@ private: void LoadCPSR(); void SaveCPSR(); + Gen::FixupBranch CheckCondition(u32 cond); + Gen::OpArg MapReg(int reg) { if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 20e1893..69b324c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -462,38 +462,10 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { int regsCount = regs.Count(); - const u8 userModeOffsets[] = - { - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, - - offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), - offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, - - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - - offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), - offsetof(ARM, 
R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, - }; - if (decrement) { MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); - preinc = !preinc; + preinc ^= true; } else MOV(32, R(ABI_PARAM1), rb); @@ -516,16 +488,16 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + /*if (usermode && reg >= 8 && reg < 15) { MOV(32, R(RSCRATCH2), R(RCPSR)); AND(32, R(RSCRATCH2), Imm8(0x1F)); // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); POP(RSCRATCH); MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else */if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -552,16 +524,16 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei { for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + /*if (usermode && reg >= 8 && reg < 15) { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! 
- MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); PUSH(RSCRATCH); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else */if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index c519229..b8dff00 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -255,7 +255,7 @@ const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); -const u32 T_BLX_REG = T_BranchAlways | T_ReadR15 | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); +const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); const u32 T_B = T_BranchAlways | tk(tk_B); const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); @@ -301,6 +301,10 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= (1 << 13); if (data & T_ReadR15) res.SrcRegs |= (1 << 15); + if (data & T_WriteR14) + res.DstRegs |= (1 << 14); + if (data & T_ReadR14) + res.SrcRegs |= (1 << 14); if (data & T_BranchAlways) res.DstRegs |= (1 << 15); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index dcd938b..51dcfa2 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -202,6 +202,7 @@ enum tk_POP, tk_LDMIA, tk_STMIA, + tk_BCOND, tk_BX, tk_BLX_REG, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 662ed5c..9401220 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,7 @@ add_library(core STATIC ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp -- cgit 
v1.2.3 From 2efab201e936ab0f60baf1de8e957080141d2d93 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 03:43:45 +0200 Subject: jit: LDM/STM finally(!) working + MUL, MLA and CLZ --- src/ARM.cpp | 7 +++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 74 +++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Branch.cpp | 7 +-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 108 +++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 14 ++++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 116 +++++++++++++++++++++++++----------- 6 files changed, 279 insertions(+), 47 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index aca876d..a77fbc4 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -80,8 +80,15 @@ ARMv4::ARMv4() : ARM(1) // } +namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} + void ARM::Reset() { + FILE* blabla = fopen("fhhg", "w"); + for (int i = 0; i < ARMInstrInfo::ak_Count; i++) + fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); + fclose(blabla); + Cycles = 0; Halted = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index c22751e..cbe67fd 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -223,6 +223,73 @@ void Compiler::A_Comp_MovOp() Comp_JumpTo(rd.GetSimpleReg(), S); } +void Compiler::A_Comp_CLZ() +{ + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + + MOV(32, R(RSCRATCH), Imm32(32)); + TEST(32, rm, rm); + FixupBranch skipZero = J_CC(CC_Z); + BSR(32, RSCRATCH, rm); + XOR(32, R(RSCRATCH), Imm8(0x1F)); // 31 - RSCRATCH + SetJumpTarget(skipZero); + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn) +{ + if (Num == 0) + Comp_AddCycles_CI(S ? 
3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, add ? 2 : 1); + } + + static_assert(EAX == RSCRATCH); + MOV(32, R(RSCRATCH), rm); + if (add) + { + IMUL(32, RSCRATCH, rs); + LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); + TEST(32, rd, rd); + } + else + { + IMUL(32, RSCRATCH, rs); + MOV(32, rd, R(RSCRATCH)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + + if (S) + Comp_RetriveFlags(false, false, false); +} + +void Compiler::A_Comp_MUL_MLA() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn; + if (add) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_MulOp(S, add, rd, rm, rs, rn); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -455,6 +522,13 @@ void Compiler::T_Comp_ALU_Imm8() } } +void Compiler::T_Comp_MUL() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + Comp_MulOp(true, false, rd, rd, rs, Imm8(-1)); +} + void Compiler::T_Comp_ALU() { OpArg rd = MapReg(CurInstr.T_Reg(0)); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index fb2acba..bd01ffb 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -126,17 +126,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + BitSet16 
hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) { if (Thumb || CurInstr.Cond() >= 0xE) - { - for (int reg : hiRegsLoaded) - RegCache.UnloadRegister(reg); - } + RegCache.Flush(); else { // the ugly way... diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6799a90..8a895d1 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,10 +26,14 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +int instructionPopularityARM[ARMInstrInfo::ak_Count]; + Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); + memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -47,7 +51,88 @@ Compiler::Compiler() MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); } - ResetStart = GetWritableCodePtr(); + { + // RSCRATCH mode + // ABI_PARAM2 reg number + // ABI_PARAM3 value in current mode + // ret - ABI_PARAM3 + ReadBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + RET(); + SetJumpTarget(irq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + RET(); + SetJumpTarget(svc); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + RET(); + SetJumpTarget(abt); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + 
RET(); + SetJumpTarget(und); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + RET(); + } + { + // RSCRATCH mode + // ABI_PARAM2 reg n + // ABI_PARAM3 value + // carry flag set if the register isn't banked + WriteBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + STC(); + RET(); + + SetJumpTarget(fiq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(irq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(svc); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(abt); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(und); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + CLC(); + RET(); + } + + ResetStart = (void*)GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -136,6 +221,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); + + if (!Thumb) + instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; if (comp == NULL || i == instrsCount - 1) { @@ -287,9 +375,9 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -315,7 +403,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // swap NULL, NULL, // LDM/STM - NULL, NULL, + A_Comp_LDM_STM, A_Comp_LDM_STM, // Branch A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, // system stuff @@ -333,7 +421,7 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative @@ -387,4 +475,14 @@ void Compiler::Comp_AddCycles_CI(u32 i) ConstantCycles += cycles; } +void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles); + + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 45b488a..89dfe28 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -40,6 +40,7 @@ private: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); + void Comp_AddCycles_CI(Gen::X64Reg i, int add); enum { @@ -55,6 +56,10 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MUL_MLA(); + + void A_Comp_CLZ(); + void A_Comp_MemWB(); void A_Comp_MemHalf(); void A_Comp_LDM_STM(); @@ -62,11 +67,13 @@ private: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MUL(); void T_Comp_RelAddr(); void T_Comp_AddSP(); @@ -88,7 +95,7 @@ private: void T_Comp_BL_Merged(FetchedInstr prefix); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); - s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -96,6 +103,8 @@ private: Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + void Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn); + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); void* Gen_MemoryRoutine9(bool store, int size); @@ -133,6 +142,9 @@ private: void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; + void* 
ReadBanked; + void* WriteBanked; + bool CPSRDirty = false; FetchedInstr CurInstr; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69b324c..8fbcafd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,7 +1,5 @@ #include "ARMJIT_Compiler.h" -#include "../GPU.h" -#include "../Wifi.h" using namespace Gen; @@ -362,7 +360,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -413,10 +411,11 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) POP(ABI_PARAM4); POP(ABI_PARAM3); + // TODO: optimise this CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -458,25 +457,35 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +void printStuff2(u32 a, u32 b) { + printf("b %x %x\n", a, b); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + FILE* f; + const u8* start = GetCodePtr(); + int regsCount = regs.Count(); if (decrement) { - MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); preinc ^= true; } else - MOV(32, R(ABI_PARAM1), rb); + MOV(32, R(ABI_PARAM1), MapReg(rn)); + + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - u32 cycles = Num + u32 cycles = Num ? 
NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -484,20 +493,29 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ? MemoryFuncsSeq9[0][preinc] : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + bool firstUserMode = true; for (int reg = 15; reg >= 0; reg--) { if (regs[reg]) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH2), R(RCPSR)); - AND(32, R(RSCRATCH2), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - POP(RSCRATCH); - MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + POP(ABI_PARAM3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + SaveReg(reg, ABI_PARAM3); + SetJumpTarget(sucessfulWritten); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -516,32 +534,48 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei if (regs[15]) { if (Num == 1) - OR(32, MapReg(15), Imm8(1)); + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); } } else { + bool firstUserMode = true; for (int reg : regs) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && 
reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); - PUSH(RSCRATCH); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, ABI_PARAM3); + else + MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(ABI_PARAM3); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); } else + { PUSH(MapReg(reg).GetSimpleReg()); + } } MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); CALL(Num == 0 ? MemoryFuncsSeq9[1][preinc] @@ -550,7 +584,14 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } - return (regsCount * 4) * (decrement ? 
-1 : 1); + if (usermode && !store) + { + f= fopen("ldm", "a"); + fwrite(start, GetCodePtr() - start, 1, f); + fclose(f); + } + + return offset; } OpArg Compiler::A_Comp_GetMemWBOffset() @@ -697,16 +738,20 @@ void Compiler::A_Comp_LDM_STM() { BitSet16 regs(CurInstr.Instr & 0xFFFF); - bool load = (CurInstr.Instr >> 20) & 1; - bool pre = (CurInstr.Instr >> 24) & 1; - bool add = (CurInstr.Instr >> 23) & 1; - bool writeback = (CurInstr.Instr >> 21) & 1; - bool usermode = (CurInstr.Instr >> 22) & 1; + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); OpArg rn = MapReg(CurInstr.A_Reg(16)); - s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; if (writeback) ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? 
Imm8(offset) : Imm32(offset)); } @@ -789,8 +834,7 @@ void Compiler::T_Comp_PUSH_POP() } OpArg sp = MapReg(13); - - s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max } @@ -801,7 +845,7 @@ void Compiler::T_Comp_LDMIA_STMIA() OpArg rb = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); if (!load || !regs[CurInstr.T_Reg(8)]) ADD(32, rb, Imm8(offset)); -- cgit v1.2.3 From 9b3c14b58abd987d9eb992b04f1f10ee8a6c91f7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 16:42:42 +0200 Subject: jit: SMULL and SMLAL --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 56 ++++++++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index cbe67fd..4afafed 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -290,6 +290,59 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } +void Compiler::A_Comp_SMULL_SMLAL() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn = MapReg(CurInstr.A_Reg(12)); + + if (Num == 0) + Comp_AddCycles_CI(S ? 
3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, 2); + } + + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + if (add) + { + MOV(32, R(RSCRATCH), rd); + SHL(64, R(RSCRATCH), Imm8(32)); + OR(64, R(RSCRATCH), rn); + + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + ADD(64, R(RSCRATCH2), R(RSCRATCH)); + } + else + { + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + if (S) + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + } + + if (S) + Comp_RetriveFlags(false, false, false); + + MOV(32, rn, R(RSCRATCH2)); + SHR(64, R(RSCRATCH2), Imm8(32)); + MOV(32, rd, R(RSCRATCH2)); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -302,9 +355,6 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); - SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8a895d1..b6dd529 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -375,7 +375,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, 
A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 89dfe28..f9bc227 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -57,6 +57,7 @@ private: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); + void A_Comp_SMULL_SMLAL(); void A_Comp_CLZ(); -- cgit v1.2.3 From 6f0dcad4f66d752f777a28e456967e638a0c8a79 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 17:01:10 +0200 Subject: jit: fix wrongly placed const --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b6dd529..e043f58 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -328,7 +328,7 @@ CompileFunc Compiler::GetCompFunc(int kind) { // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
// see ARMInstrInfo.h for the order - const CompileFunc A_Comp[ARMInstrInfo::ak_Count] = + CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = { // AND A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, @@ -410,7 +410,7 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; - const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { // Shift imm T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, // Three operand ADD/SUB -- cgit v1.2.3 From dcf6e1cad2b38dc4fe0dcbdb789f92e01f802a4a Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 02:37:32 +0200 Subject: jit: fix linux --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 48 +++--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 288 +++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 8 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 15 +- src/dolphin/Log.h | 13 +- src/dolphin/MemoryUtil.cpp | 13 +- 7 files changed, 193 insertions(+), 194 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 4afafed..013f54c 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -154,13 +154,13 @@ void Compiler::A_Comp_Arith() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x2: // SUB - Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOp(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x3: // RSB if (op2.IsZero()) @@ -172,25 +172,25 @@ void 
Compiler::A_Comp_Arith() Comp_RetriveFlags(true, true, false); } else - Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOpReverse(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x4: // ADD - Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); break; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC - Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); break; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); break; default: assert("unimplemented"); @@ -392,11 +392,11 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b { void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; if (op == 0) - shiftOp = SHL; + shiftOp = &Compiler::SHL; else if (op == 1) - shiftOp = SHR; + shiftOp = &Compiler::SHR; else if (op == 2) - shiftOp = SAR; + shiftOp = &Compiler::SAR; CMP(32, R(ECX), Imm8(32)); FixupBranch lt32 = J_CC(CC_L); @@ -539,9 +539,9 @@ void 
Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); if (op & 1) - Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else - Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); } void Compiler::T_Comp_ALU_Imm8() @@ -564,10 +564,10 @@ void Compiler::T_Comp_ALU_Imm8() Comp_CmpOp(2, rd, imm, false); return; case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); return; case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); return; } } @@ -594,10 +594,10 @@ void Compiler::T_Comp_ALU() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x2: case 0x3: @@ -613,10 +613,10 @@ void Compiler::T_Comp_ALU() } return; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); return; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); return; case 0x8: // TST Comp_CmpOp(0, rd, rs, false); @@ -634,10 +634,10 @@ void Compiler::T_Comp_ALU() Comp_CmpOp(3, rd, rs, false); return; 
case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::OR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); return; case 0xF: // MVN if (rd != rs) @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index bd01ffb..05c8ec6 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -118,7 +118,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (setupRegion) { - MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), Imm32(newPC)); CALL((void*)&ARMv5::SetupCodeMem); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index e043f58..2b7ccd2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,6 +4,12 @@ #include +#ifdef _WIN32 +#else +#include +#include +#endif + using namespace Gen; namespace ARMJIT @@ -28,9 +34,34 @@ const int RegisterCache::NativeRegsAvailable = int instructionPopularityARM[ARMInstrInfo::ak_Count]; +/* + We'll repurpose this .bss memory + + */ +u8 CodeMemory[1024 * 1024 * 32]; + Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 16); +#ifdef _WIN32 +#else + u64 pagesize = sysconf(_SC_PAGE_SIZE); +#endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; + +#ifdef _WIN32 +#else + 
mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); +#endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + + ClearCodeSpace(); + + SetCodePtr(pageAligned); memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); @@ -187,6 +218,124 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) } } +#define F(x) &Compiler::x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // EOR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SUB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADD + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), 
F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SBC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ORR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MOV + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // BIC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MVN + 
F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // TST + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // TEQ + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMP + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMN + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // Mul + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + F(A_Comp_CLZ), NULL, NULL, NULL, NULL, + // STR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRD, STRD never used by anything so 
they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSB + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // swap + NULL, NULL, + // LDM/STM + F(A_Comp_LDM_STM), F(A_Comp_LDM_STM), + // Branch + F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + // Shift imm + F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), + // Three operand ADD/SUB + F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), + // 8 bit imm + F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), + // general ALU + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU), + // hi reg + F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), + // pc/sp relative + F(T_Comp_RelAddr), F(T_Comp_RelAddr), F(T_Comp_AddSP), + // LDR pcrel + F(T_Comp_LoadPCRel), + // LDR/STR reg offset + F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), + // LDR/STR sign extended, half + F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), + // LDR/STR imm offset + F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), + // LDR/STR half imm offset + F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf), + // LDR/STR sp rel + F(T_Comp_MemSPRel), F(T_Comp_MemSPRel), + // PUSH/POP + F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), + // LDMIA, STMIA + 
F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), + // Branch + F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), + // Unk, SVC + NULL, NULL +}; +#undef F + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -206,7 +355,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -220,8 +369,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; - CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); - + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + if (!Thumb) instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; @@ -318,139 +469,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); RET(); return res; } -CompileFunc Compiler::GetCompFunc(int kind) -{ - // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. 
- // see ARMInstrInfo.h for the order - CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = - { - // AND - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // EOR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SUB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADD - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SBC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSC - A_Comp_Arith, A_Comp_Arith, 
A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ORR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MOV - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // BIC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MVN - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // TST - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // TEQ - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // CMP - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // CMN - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, - // ARMv5 stuff - A_Comp_CLZ, NULL, NULL, NULL, NULL, 
- // STR - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // STRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDR - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // STRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // LDRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSB - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // swap - NULL, NULL, - // LDM/STM - A_Comp_LDM_STM, A_Comp_LDM_STM, - // Branch - A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, - // system stuff - NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - - CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { - // Shift imm - T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, - // Three operand ADD/SUB - T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, - // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, - // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, 
T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, - // hi reg - T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, - // pc/sp relative - T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, - // LDR pcrel - T_Comp_LoadPCRel, - // LDR/STR reg offset - T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, - // LDR/STR sign extended, half - T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, - // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, - // LDR/STR half imm offset - T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // LDR/STR sp rel - T_Comp_MemSPRel, T_Comp_MemSPRel, - // PUSH/POP - T_Comp_PUSH_POP, T_Comp_PUSH_POP, - // LDMIA, STMIA - T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA, - // Branch - T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2, - // Unk, SVC - NULL, NULL - }; - - return Thumb ? T_Comp[kind] : A_Comp[kind]; -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f9bc227..e04f96a 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,8 +6,6 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegisterCache.h" -#include - namespace ARMJIT { @@ -18,9 +16,6 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler; - -typedef void (Compiler::*CompileFunc)(); class Compiler : public Gen::X64CodeBlock { @@ -32,8 +27,7 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); -private: - CompileFunc GetCompFunc(int kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 8fbcafd..15a40f8 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -464,9 +464,6 @@ void printStuff2(u32 a, u32 b) s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - FILE* f; - const u8* start = GetCodePtr(); - int regsCount = regs.Count(); if (decrement) @@ -482,11 +479,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -581,14 +579,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? 
MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); - } - - if (usermode && !store) - { - f= fopen("ldm", "a"); - fwrite(start, GetCodePtr() - start, 1, f); - fclose(f); + ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } return offset; diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h index 21e69a5..a7f4b6a 100644 --- a/src/dolphin/Log.h +++ b/src/dolphin/Log.h @@ -4,12 +4,13 @@ #include -#define PanicAlert(msg) \ - do \ - { \ - printf("%s\n", msg); \ - Crash(); \ - } while (false) +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + #define DYNA_REC 0 diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp index 01cb897..7273a8a 100644 --- a/src/dolphin/MemoryUtil.cpp +++ b/src/dolphin/MemoryUtil.cpp @@ -6,15 +6,9 @@ #include #include -#define PanicAlert(fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - #include "../types.h" #include "CommonFuncs.h" +#include "Log.h" #ifdef _WIN32 #include @@ -39,8 +33,6 @@ namespace Common void* AllocateExecutableMemory(size_t size) { - printf("c\n"); - #if defined(_WIN32) void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); #else @@ -50,13 +42,10 @@ void* AllocateExecutableMemory(size_t size) if (ptr == MAP_FAILED) ptr = nullptr; #endif - printf("a\n"); if (ptr == nullptr) PanicAlert("Failed to allocate executable memory"); - printf("b\n"); - return ptr; } -- cgit v1.2.3 From 9d76d63af5d496e232018d6ddf8ee1e55ad440ad Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 04:33:36 +0200 Subject: jit: make everything configurable --- src/ARM.cpp | 127 ++++++++++++++++++++++++++++++++----- src/ARM.h | 3 + src/ARMJIT.cpp | 21 ++++-- src/ARMJIT.h | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 14 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/Config.cpp 
| 6 ++ src/Config.h | 3 + src/NDS.cpp | 26 +++++++- src/libui_sdl/DlgEmuSettings.cpp | 16 +++++ src/libui_sdl/PlatformConfig.cpp | 1 + src/libui_sdl/main.cpp | 17 ++--- 12 files changed, 192 insertions(+), 46 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index a77fbc4..6cc80c0 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -489,7 +489,7 @@ void ARMv5::Execute() while (NDS::ARM9Timestamp < NDS::ARM9Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -522,14 +522,8 @@ void ARMv5::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(0, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); - + } + // TODO optimize this shit!!! if (Halted) { @@ -554,6 +548,58 @@ void ARMv5::Execute() Halted = 0; } +void ARMv5::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(0)) + { + Halted = 0; + if (NDS::IME[0] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM9Timestamp = NDS::ARM9Target; + return; + } + } + + while (NDS::ARM9Timestamp < NDS::ARM9Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(0, instrAddr)) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + printf("ARMv5 PC in non executable region %08X\n", R[15]); + return; + } + + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); + + if (Halted) + { + if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; + } + if (IRQ) TriggerIRQ(); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} + void ARMv4::Execute() { if (Halted) @@ -577,7 +623,7 @@ void ARMv4::Execute() while (NDS::ARM7Timestamp < NDS::ARM7Target) { - /*if (CPSR & 0x20) // THUMB + if (CPSR & 0x20) // THUMB { // prefetch R[15] += 2; @@ -605,13 +651,7 @@ void ARMv4::Execute() } else AddCycles_C(); - }*/ - - /*if (!ARMJIT::IsMapped(1, R[15] - ((CPSR&0x20)?2:4))) - printf("aaarg ungempappter raum %x\n", R[15]);*/ - - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + } // TODO optimize this shit!!! if (Halted) @@ -636,3 +676,56 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; } + +void ARMv4::ExecuteJIT() +{ + if (Halted) + { + if (Halted == 2) + { + Halted = 0; + } + else if (NDS::HaltInterrupted(1)) + { + Halted = 0; + if (NDS::IME[1] & 0x1) + TriggerIRQ(); + } + else + { + NDS::ARM7Timestamp = NDS::ARM7Target; + return; + } + } + + while (NDS::ARM7Timestamp < NDS::ARM7Target) + { + u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); + if (!ARMJIT::IsMapped(1, instrAddr)) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + printf("ARMv4 PC in non executable region %08X\n", R[15]); + return; + } + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + // TODO optimize this shit!!! 
+ if (Halted) + { + if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; + } + + if (IRQ) TriggerIRQ(); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; + } + + if (Halted == 2) + Halted = 0; +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index b9f5d89..0544301 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -52,6 +52,7 @@ public: } virtual void Execute() = 0; + virtual void ExecuteJIT() = 0; bool CheckCondition(u32 code) { @@ -151,6 +152,7 @@ public: void DataAbort(); void Execute(); + void ExecuteJIT(); // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -269,6 +271,7 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); + void ExecuteJIT(); u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 47b425f..e8e6be0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,8 @@ #include +#include "Config.h" + #include "ARMJIT_x64/ARMJIT_Compiler.h" namespace ARMJIT @@ -125,18 +127,21 @@ CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; - FetchedInstr instrs[12]; + if (Config::JIT_MaxBlockSize < 1) + Config::JIT_MaxBlockSize = 1; + if (Config::JIT_MaxBlockSize > 32) + Config::JIT_MaxBlockSize = 32; + + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 r15Initial = cpu->R[15]; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; - //printf("block %x %d\n", r15, thumb); do { r15 += thumb ? 
2 : 4; instrs[i].Instr = nextInstr[0]; - //printf("%x %x\n", instrs[i].Instr, r15); instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; if (cpu->Num == 0) @@ -166,16 +171,16 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < 10); + } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, r15Initial - (thumb ? 2 : 4), block); + InsertBlock(cpu->Num, blockAddr, block); return block; } -void ResetBlocks() +void InvalidateBlockCache() { memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); @@ -185,6 +190,8 @@ void ResetBlocks() memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + + compiler->Reset(); } } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..004256c 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -111,7 +111,7 @@ void DeInit(); CompiledBlock CompileBlock(ARM* cpu); -void ResetBlocks(); +void InvalidateBlockCache(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 2b7ccd2..fe23859 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -336,13 +336,15 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +void Compiler::Reset() +{ + SetCodePtr((u8*)ResetStart); +} + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) - { - ResetBlocks(); - SetCodePtr((u8*)ResetStart); - } + InvalidateBlockCache(); CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -355,7 +357,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - 
ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -469,7 +471,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); return res; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e04f96a..cd58012 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -22,6 +22,8 @@ class Compiler : public Gen::X64CodeBlock public: Compiler(); + void Reset(); + CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); diff --git a/src/Config.cpp b/src/Config.cpp index f558ef6..37b701c 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -34,6 +34,9 @@ int Threaded3D; int GL_ScaleFactor; int GL_Antialias; +bool JIT_Enable = false; +int JIT_MaxBlockSize = 12; + ConfigEntry ConfigFile[] = { {"3DRenderer", 0, &_3DRenderer, 1, NULL, 0}, @@ -42,6 +45,9 @@ ConfigEntry ConfigFile[] = {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, NULL, 0}, {"GL_Antialias", 0, &GL_Antialias, 0, NULL, 0}, + {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, + {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 84fd57b..18a7910 100644 --- a/src/Config.h +++ b/src/Config.h @@ -46,6 +46,9 @@ extern int Threaded3D; extern int GL_ScaleFactor; extern int GL_Antialias; +extern bool JIT_Enable; +extern int JIT_MaxBlockSize; + } #endif // CONFIG_H diff --git a/src/NDS.cpp b/src/NDS.cpp index baa5e0d..4b50d9c 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -524,7 +524,7 @@ 
void Reset() KeyCnt = 0; RCnt = 0; - ARMJIT::ResetBlocks(); + ARMJIT::InvalidateBlockCache(); NDSCart::Reset(); GBACart::Reset(); @@ -741,6 +741,11 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } + if (!file->Saving) + { + ARMJIT::InvalidateBlockCache(); + } + return true; } @@ -826,6 +831,7 @@ void RunSystem(u64 timestamp) } } +template u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -858,7 +864,10 @@ u32 RunFrame() } else { - ARM9->Execute(); + if (EnableJIT) + ARM9->ExecuteJIT(); + else + ARM9->Execute(); } RunTimers(0); @@ -880,7 +889,10 @@ u32 RunFrame() } else { - ARM7->Execute(); + if (EnableJIT) + ARM7->ExecuteJIT(); + else + ARM7->Execute(); } RunTimers(1); @@ -910,6 +922,14 @@ u32 RunFrame() return GPU::TotalScanlines; } +u32 RunFrame() +{ + if (Config::JIT_Enable) + return RunFrame(); + else + return RunFrame(); +} + void Reschedule(u64 target) { if (CurCPU == 0) diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 0ccaed7..116d2da 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -57,10 +57,20 @@ void OnOk(uiButton* btn, void* blarg) { Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); + Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled); + long blockSize = strtol(uiEntryText(enJITMaxBlockSize), NULL, 10); + if (blockSize < 1) + blockSize = 1; + if (blockSize > 32) + blockSize = 32; + Config::JIT_MaxBlockSize = blockSize; + Config::Save(); uiControlDestroy(uiControl(win)); opened = false; + + ApplyNewSettings(4); } void OnJITStateChanged(uiCheckbox* cb, void* blarg) @@ -143,6 +153,12 @@ void Open() uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); + uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); + { + char maxBlockSizeStr[10]; + sprintf(maxBlockSizeStr, "%d", Config::JIT_MaxBlockSize); + uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); + } OnJITStateChanged(cbJITEnabled, NULL); uiControlShow(uiControl(win)); diff --git 
a/src/libui_sdl/PlatformConfig.cpp b/src/libui_sdl/PlatformConfig.cpp index f78b195..b6d1e8d 100644 --- a/src/libui_sdl/PlatformConfig.cpp +++ b/src/libui_sdl/PlatformConfig.cpp @@ -64,6 +64,7 @@ char MicWavPath[512]; char LastROMFolder[512]; +bool EnableJIT; ConfigEntry PlatformConfigFile[] = { diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 8e8bf9e..d6809c3 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -48,6 +48,7 @@ #include "../Wifi.h" #include "../Platform.h" #include "../Config.h" +#include "../ARMJIT.h" #include "../Savestate.h" @@ -2408,19 +2409,11 @@ void ApplyNewSettings(int type) GPU3D::InitRenderer(Screen_UseGL); if (Screen_UseGL) uiGLMakeContextCurrent(NULL); } - /*else if (type == 4) // vsync + else if (type == 4) { - if (Screen_UseGL) - { - uiGLMakeContextCurrent(GLContext); - uiGLSetVSync(Config::ScreenVSync); - uiGLMakeContextCurrent(NULL); - } - else - { - // TODO eventually: VSync for non-GL screen? - } - }*/ + if (Config::JIT_Enable) + ARMJIT::InvalidateBlockCache(); + } EmuRunning = prevstatus; } -- cgit v1.2.3 From 411fb57c07c732a2b60e3566ae045f8f60eea29d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 19:24:00 +0200 Subject: jit: add compile option --- CMakeLists.txt | 30 +++++++++++++++++++ src/ARM.cpp | 13 ++++---- src/ARM.h | 6 ++++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 61 +++++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 - src/CMakeLists.txt | 25 +++++++++------- src/CP15.cpp | 12 ++++++-- src/Config.cpp | 4 +++ src/Config.h | 2 ++ src/NDS.cpp | 26 ++++++++++++++++ src/dolphin/CodeBlock.h | 3 -- src/libui_sdl/DlgEmuSettings.cpp | 21 +++++++++++-- src/libui_sdl/main.cpp | 2 ++ 13 files changed, 151 insertions(+), 55 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/CMakeLists.txt b/CMakeLists.txt index 048dd44..d59e19c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,36 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() 
+include(CheckSymbolExists) +function(detect_architecture symbol arch) + if (NOT DEFINED ARCHITECTURE) + set(CMAKE_REQUIRED_QUIET 1) + check_symbol_exists("${symbol}" "" ARCHITECTURE_${arch}) + unset(CMAKE_REQUIRED_QUIET) + + # The output variable needs to be unique across invocations otherwise + # CMake's crazy scope rules will keep it defined + if (ARCHITECTURE_${arch}) + set(ARCHITECTURE "${arch}" PARENT_SCOPE) + set(ARCHITECTURE_${arch} 1 PARENT_SCOPE) + add_definitions(-DARCHITECTURE_${arch}=1) + endif() + endif() +endfunction() + +detect_architecture("__x86_64__" x86_64) +detect_architecture("__i386__" x86) +detect_architecture("__arm__" ARM) +detect_architecture("__aarch64__" ARM64) + +if (ARCHITECTURE STREQUAL x86_64) + option(ENABLE_JIT "Enable x64 JIT recompiler" ON) +endif() + +if (ENABLE_JIT) + add_definitions(-DJIT_ENABLED) +endif() + if (CMAKE_BUILD_TYPE STREQUAL Release) option(ENABLE_LTO "Enable link-time optimization" ON) else() diff --git a/src/ARM.cpp b/src/ARM.cpp index 6cc80c0..eb58d02 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -80,15 +80,8 @@ ARMv4::ARMv4() : ARM(1) // } -namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} - void ARM::Reset() { - FILE* blabla = fopen("fhhg", "w"); - for (int i = 0; i < ARMInstrInfo::ak_Count; i++) - fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); - fclose(blabla); - Cycles = 0; Halted = 0; @@ -548,6 +541,7 @@ void ARMv5::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv5::ExecuteJIT() { if (Halted) @@ -599,6 +593,7 @@ void ARMv5::ExecuteJIT() if (Halted == 2) Halted = 0; } +#endif void ARMv4::Execute() { @@ -677,6 +672,7 @@ void ARMv4::Execute() Halted = 0; } +#ifdef JIT_ENABLED void ARMv4::ExecuteJIT() { if (Halted) @@ -728,4 +724,5 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; -} \ No newline at end of file +} +#endif \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index 0544301..ecdf5b4 100644 --- a/src/ARM.h +++ 
b/src/ARM.h @@ -52,7 +52,9 @@ public: } virtual void Execute() = 0; +#ifdef ENABLE_JIT virtual void ExecuteJIT() = 0; +#endif bool CheckCondition(u32 code) { @@ -152,7 +154,9 @@ public: void DataAbort(); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); @@ -271,7 +275,9 @@ public: void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); +#ifdef JIT_ENABLED void ExecuteJIT(); +#endif u16 CodeRead16(u32 addr) { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fe23859..18cb27e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,7 +4,10 @@ #include +#include "../dolphin/CommonFuncs.h" + #ifdef _WIN32 +#include #else #include #include @@ -32,8 +35,6 @@ const int RegisterCache::NativeRegsAvailable = #endif ; -int instructionPopularityARM[ARMInstrInfo::ak_Count]; - /* We'll repurpose this .bss memory @@ -42,29 +43,33 @@ u8 CodeMemory[1024 * 1024 * 32]; Compiler::Compiler() { -#ifdef _WIN32 -#else - u64 pagesize = sysconf(_SC_PAGE_SIZE); -#endif - - u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); - u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; - -#ifdef _WIN32 -#else - mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); -#endif - - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + { + #ifdef _WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + + u64 pageSize = (u64)sysInfo.dwPageSize; + #else + u64 pageSize = sysconf(_SC_PAGE_SIZE); + #endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned; + + #ifdef _WIN32 + DWORD dummy; + VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #else + 
mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); + #endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + } ClearCodeSpace(); - SetCodePtr(pageAligned); - - memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); - for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -118,7 +123,7 @@ Compiler::Compiler() SetJumpTarget(und); MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); RET(); - } + } { // RSCRATCH mode // ABI_PARAM2 reg n @@ -163,7 +168,10 @@ Compiler::Compiler() RET(); } - ResetStart = (void*)GetWritableCodePtr(); + // move the region forward to prevent overwriting the generated functions + region_size -= GetWritableCodePtr() - region; + total_region_size = region_size; + region = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -338,7 +346,7 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - SetCodePtr((u8*)ResetStart); + ClearCodeSpace(); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) @@ -375,9 +383,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (!Thumb) - instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; - if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index cd58012..0ce7d8d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -132,7 +132,6 @@ public: return Gen::R(RegCache.Mapping[reg]); } - void* ResetStart; void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9401220..10428aa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,19 +30,22 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp +) - ARMJIT.cpp - ARMJIT_x64/ARMJIT_Compiler.cpp - ARMJIT_x64/ARMJIT_ALU.cpp - ARMJIT_x64/ARMJIT_LoadStore.cpp - ARMJIT_x64/ARMJIT_Branch.cpp +if (ENABLE_JIT) + target_sources(core PRIVATE + ARMJIT.cpp + ARMJIT_x64/ARMJIT_Compiler.cpp + ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp + ARMJIT_x64/ARMJIT_Branch.cpp - dolphin/CommonFuncs.cpp - dolphin/x64ABI.cpp - dolphin/x64CPUDetect.cpp - dolphin/x64Emitter.cpp - dolphin/MemoryUtil.cpp -) + dolphin/CommonFuncs.cpp + dolphin/x64ABI.cpp + dolphin/x64CPUDetect.cpp + dolphin/x64Emitter.cpp + ) +endif() if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) diff --git a/src/CP15.cpp b/src/CP15.cpp index f232bec..e6e91c3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -812,7 +812,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -834,7 +836,9 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= 
DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -856,8 +860,10 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) @@ -879,8 +885,10 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2] = NULL; - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) / 2 + 1] = NULL; +#ifdef JIT_ENABLED + ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; +#endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) diff --git a/src/Config.cpp b/src/Config.cpp index 37b701c..3cff0ed 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -34,8 +34,10 @@ int Threaded3D; int GL_ScaleFactor; int GL_Antialias; +#ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +#endif ConfigEntry ConfigFile[] = { @@ -45,8 +47,10 @@ ConfigEntry ConfigFile[] = {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, NULL, 0}, {"GL_Antialias", 0, &GL_Antialias, 0, NULL, 0}, +#ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, +#endif {"", -1, NULL, 0, NULL, 0} }; diff --git a/src/Config.h b/src/Config.h index 18a7910..c13eae3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -46,8 +46,10 @@ extern int Threaded3D; extern int GL_ScaleFactor; extern int GL_Antialias; +#ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +#endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 4b50d9c..62a52aa 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -162,7 +162,9 @@ bool Init() ARM9 = new ARMv5(); ARM7 = new ARMv4(); +#ifdef JIT_ENABLED ARMJIT::Init(); +#endif DMAs[0] = new DMA(0, 0); DMAs[1] = new DMA(0, 1); @@ -194,7 +196,9 @@ void 
DeInit() delete ARM9; delete ARM7; +#ifdef JIT_ENABLED ARMJIT::DeInit(); +#endif for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -524,7 +528,9 @@ void Reset() KeyCnt = 0; RCnt = 0; +#ifdef JIT_ENABLED ARMJIT::InvalidateBlockCache(); +#endif NDSCart::Reset(); GBACart::Reset(); @@ -741,10 +747,12 @@ bool DoSavestate(Savestate* file) GPU::SetPowerCnt(PowerControl9); } +#ifdef JIT_ENABLED if (!file->Saving) { ARMJIT::InvalidateBlockCache(); } +#endif return true; } @@ -864,9 +872,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM9->ExecuteJIT(); else +#endif ARM9->Execute(); } @@ -889,9 +899,11 @@ u32 RunFrame() } else { +#ifdef JIT_ENABLED if (EnableJIT) ARM7->ExecuteJIT(); else +#endif ARM7->Execute(); } @@ -924,9 +936,11 @@ u32 RunFrame() u32 RunFrame() { +#ifdef JIT_ENABLED if (Config::JIT_Enable) return RunFrame(); else +#endif return RunFrame(); } @@ -1849,7 +1863,9 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -1901,7 +1917,9 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(0, addr); +#endif switch (addr & 0xFF000000) { @@ -1969,7 +1987,9 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(0, addr); +#endif switch (addr & 0xFF000000) { @@ -2264,7 +2284,9 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2325,7 +2347,9 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate16(1, addr); +#endif switch (addr & 0xFF800000) { @@ -2396,7 +2420,9 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { +#ifdef JIT_ENABLED ARMJIT::Invalidate32(1, addr); +#endif switch (addr & 0xFF800000) { diff --git a/src/dolphin/CodeBlock.h 
b/src/dolphin/CodeBlock.h index 31a8d93..e71cf6d 100644 --- a/src/dolphin/CodeBlock.h +++ b/src/dolphin/CodeBlock.h @@ -9,7 +9,6 @@ #include "Assert.h" #include "../types.h" -#include "MemoryUtil.h" namespace Common { @@ -41,8 +40,6 @@ public: CodeBlock() = default; virtual ~CodeBlock() { - if (region) - FreeCodeSpace(); } CodeBlock(const CodeBlock&) = delete; CodeBlock& operator=(const CodeBlock&) = delete; diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 116d2da..46f5f9f 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -38,8 +38,10 @@ uiWindow* win; uiCheckbox* cbDirectBoot; +#ifdef JIT_ENABLED uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; +#endif int OnCloseWindow(uiWindow* window, void* blarg) { @@ -57,13 +59,17 @@ void OnOk(uiButton* btn, void* blarg) { Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); +#ifdef JIT_ENABLED Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled); - long blockSize = strtol(uiEntryText(enJITMaxBlockSize), NULL, 10); + char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); + long blockSize = strtol(maxBlockSizeStr, NULL, 10); + uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; Config::JIT_MaxBlockSize = blockSize; +#endif Config::Save(); @@ -73,6 +79,7 @@ void OnOk(uiButton* btn, void* blarg) ApplyNewSettings(4); } +#ifdef JIT_ENABLED void OnJITStateChanged(uiCheckbox* cb, void* blarg) { if (uiCheckboxChecked(cb)) @@ -80,6 +87,7 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) else uiControlDisable(uiControl(enJITMaxBlockSize)); } +#endif void Open() { @@ -90,7 +98,7 @@ void Open() } opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 170, 0, 0, 0); + win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0); uiWindowSetMargined(win, 1); uiWindowOnClosing(win, OnCloseWindow, NULL); @@ -105,6 +113,7 @@ void Open() uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); } +#ifdef 
JIT_ENABLED { uiLabel* dummy = uiNewLabel(""); uiBoxAppend(top, uiControl(dummy), 0); @@ -133,6 +142,12 @@ void Open() uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); } } +#endif + + { + uiLabel* dummy = uiNewLabel(""); + uiBoxAppend(top, uiControl(dummy), 0); + } { uiBox* in_ctrl = uiNewHorizontalBox(); @@ -153,6 +168,7 @@ void Open() uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); +#ifdef JIT_ENABLED uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); { char maxBlockSizeStr[10]; @@ -160,6 +176,7 @@ void Open() uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); } OnJITStateChanged(cbJITEnabled, NULL); +#endif uiControlShow(uiControl(win)); } diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index d6809c3..af05d7a 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2411,8 +2411,10 @@ void ApplyNewSettings(int type) } else if (type == 4) { +#ifdef JIT_ENABLED if (Config::JIT_Enable) ARMJIT::InvalidateBlockCache(); +#endif } EmuRunning = prevstatus; -- cgit v1.2.3 From 8ddc4d5904bafa72a6822bb2f487c9d7f100eb16 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jul 2019 19:17:10 +0200 Subject: jit: fix BLX_reg with rn=lr --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 3 ++- src/ARM_InstrInfo.cpp | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 05c8ec6..1f95a90 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -187,9 +187,10 @@ void Compiler::A_Comp_BranchImm() void Compiler::A_Comp_BranchXchangeReg() { OpArg rn = MapReg(CurInstr.A_Reg(0)); + MOV(32, R(RSCRATCH), rn); if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg MOV(32, MapReg(14), Imm32(R15 - 4)); - Comp_JumpTo(rn.GetSimpleReg()); + Comp_JumpTo(RSCRATCH); } void Compiler::T_Comp_BCOND() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b8dff00..c36d6c1 100644 --- a/src/ARM_InstrInfo.cpp +++ 
b/src/ARM_InstrInfo.cpp @@ -359,10 +359,7 @@ Info Decode(bool thumb, u32 num, u32 instr) } if (data & A_Link) - { res.DstRegs |= 1 << 14; - res.SrcRegs |= 1 << 15; - } if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right -- cgit v1.2.3 From 54985be1573710ae39f3c485141b8cbfd3bdf64c Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jul 2019 20:34:08 +0200 Subject: jit: LDM/STM keep proper stack alignment --- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 15a40f8..ee0a7af 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -480,11 +480,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + // we need to make sure that the stack stays aligned to 16 bytes + u32 stackAlloc = ((regsCount + 1) & ~1) * 8; + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -508,7 +511,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc POP(ABI_PARAM3); CALL(WriteBanked); FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); SaveReg(reg, ABI_PARAM3); SetJumpTarget(sucessfulWritten); @@ -529,6 +532,9 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + if (regsCount & 1) + POP(RSCRATCH); + if (regs[15]) { if (Num == 1) @@ -543,6 +549,9 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + if (regsCount & 1) + PUSH(RSCRATCH); + bool firstUserMode = true; for (int reg : regs) { @@ -572,6 +581,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PUSH(MapReg(reg).GetSimpleReg()); } } + MOV(64, R(ABI_PARAM2), R(RSP)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); @@ -579,7 +589,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); } return offset; -- cgit v1.2.3 From be8846e31a80bef098cfa03cef5748d3d8011715 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 17 Jul 2019 03:18:37 +0200 Subject: jit: fix misc static branch things --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 27 +++++++++++++++++++++++---- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 15 ++++++++++----- src/ARM_InstrInfo.cpp | 11 ++++------- 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 1f95a90..6ae4aad 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -35,6 +35,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 newregion = addr >> 24; u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; + u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); @@ -53,7 +54,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (addr & 0x2) { nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; - cycles += CurCPU->CodeCycles; + cycles += cpu9->CodeCycles; nextInstr[1] = cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } @@ -61,7 +62,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { nextInstr[0] = cpu9->CodeRead32(addr, true); nextInstr[1] = nextInstr[0] >> 16; - cycles += CurCPU->CodeCycles; + cycles += cpu9->CodeCycles; } } else @@ -74,6 +75,10 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) nextInstr[1] = cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } + + cpu9->RegionCodeCycles = compileTimeCodeCycles; + if (setupRegion) + cpu9->SetupCodeMem(R15); } else { @@ -86,26 +91,40 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = codeCycles; MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), 
Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); if (addr & 0x1) { addr &= ~0x1; newPC = addr+2; + // this is necessary because ARM7 bios protection + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; + + CurCPU->R[15] = compileTimePC; } else { addr &= ~0x3; newPC = addr+4; + u32 compileTimePC = CurCPU->R[15]; + CurCPU->R[15] = newPC; + nextInstr[0] = cpu7->CodeRead32(addr); nextInstr[1] = cpu7->CodeRead32(addr+4); cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; + + CurCPU->R[15] = compileTimePC; } + + cpu7->CodeRegion = R15 >> 24; + cpu7->CodeCycles = addr >> 15; } MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); @@ -204,7 +223,7 @@ void Compiler::T_Comp_BCOND() FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 18cb27e..1e871fd 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -354,8 +354,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) InvalidateBlockCache(); - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; @@ -363,6 +361,13 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CodeRegion = cpu->CodeRegion; CurCPU = cpu; + CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); + + if (!IsMapped(Num, R15 - Thumb ? 
2 : 4)) + { + printf("Trying to compile a block in unmapped memory\n"); + } + bool mergedThumbBL = false; ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -383,7 +388,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; - if (comp == NULL || i == instrsCount - 1) + bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; + if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); @@ -454,10 +460,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else (this->*comp)(); - FixupBranch skipFailed; if (CurInstr.Cond() < 0xE) { - skipFailed = J(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index c36d6c1..5db2471 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -178,7 +178,6 @@ enum { T_ReadR13 = 1 << 9, T_WriteR13 = 1 << 10, - T_ReadR15 = 1 << 11, T_BranchAlways = 1 << 12, T_ReadR14 = 1 << 13, @@ -222,7 +221,7 @@ const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); -const u32 T_ADD_PCREL = T_Write8 | T_ReadR15 | tk(tk_ADD_PCREL); +const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); @@ -257,11 +256,11 @@ const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); const u32 T_BLX_REG = T_BranchAlways | T_WriteR14 | T_ReadHi3 | tk(tk_BLX_REG); const u32 T_B = 
T_BranchAlways | tk(tk_B); -const u32 T_BL_LONG_1 = T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_1); -const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | T_ReadR15 | tk(tk_BL_LONG_2); +const u32 T_BL_LONG_1 = T_WriteR14 | tk(tk_BL_LONG_1); +const u32 T_BL_LONG_2 = T_BranchAlways | T_ReadR14 | T_WriteR14 | tk(tk_BL_LONG_2); const u32 T_UNK = T_BranchAlways | T_WriteR14 | tk(tk_UNK); -const u32 T_SVC = T_BranchAlways | T_WriteR14 | T_ReadR15 | tk(tk_SVC); +const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); #define INSTRFUNC_PROTO(x) u32 x #include "ARM_InstrTable.h" @@ -299,8 +298,6 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= (1 << 13); if (data & T_WriteR13) res.DstRegs |= (1 << 13); - if (data & T_ReadR15) - res.SrcRegs |= (1 << 15); if (data & T_WriteR14) res.DstRegs |= (1 << 14); if (data & T_ReadR14) -- cgit v1.2.3 From 9d180c7bbc8ccb3459ab2ab14dd2adc7a0f71cf3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 13:36:48 +0200 Subject: jit: decrease blockcache AddrMapping size for ARM9 --- src/ARM.cpp | 8 ++--- src/ARMJIT.cpp | 18 ++++++---- src/ARMJIT.h | 67 ++++++++++++++++++++++++++++---------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++- src/NDS.cpp | 12 +++---- 5 files changed, 74 insertions(+), 35 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index eb58d02..b68b5eb 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -566,14 +566,14 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped(0, instrAddr)) + if (!ARMJIT::IsMapped<0>(instrAddr)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, instrAddr); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); if (Halted) @@ -697,13 +697,13 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped(1, instrAddr)) + if (!ARMJIT::IsMapped<1>(instrAddr)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, instrAddr); + ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); Cycles += (block ? block : ARMJIT::CompileBlock(this))(); // TODO optimize this shit!!! diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index e8e6be0..aad14c0 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -109,11 +109,14 @@ void Init() { memset(&cache, 0, sizeof(BlockCache)); - for (int cpu = 0; cpu < 2; cpu++) - for (int i = 0; i < 0x4000; i++) - cache.AddrMapping[cpu][i] = JIT_MEM[cpu][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[cpu][i >> 9]) - + (((i << 14) & JIT_MASK[cpu][i >> 9]) >> 1); + for (int i = 0; i < 0x2000; i++) + cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : + (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) + + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + for (int i = 0; i < 0x4000; i++) + cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : + (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) + + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); compiler = new Compiler(); } @@ -175,7 +178,10 @@ CompiledBlock CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, blockAddr, block); + if (cpu->Num == 0) + InsertBlock<0>(blockAddr, block); + else + InsertBlock<1>(blockAddr, block); return block; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 004256c..0fc1c38 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -47,9 +47,11 @@ struct FetchedInstr a function which executes a block instructions starting from there. The most significant 4 bits of each address is ignored. 
This 28 bit space is - divided into 0x4000 16 KB blocks, each of which a pointer to the relevant - place inside the before mentioned arrays. Only half of the bytes need to be - addressed (ARM address are aligned to 4, Thumb addresses to a 2 byte boundary). + divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which + a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB + are the sizes of the smallest contigous memory region mapped to the respective CPU. + Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, + we only need every second half word to be adressable. In case a memory write hits mapped memory, the function block at this address is set to null, so it's recompiled the next time it's executed. @@ -61,7 +63,8 @@ struct FetchedInstr struct BlockCache { - CompiledBlock* AddrMapping[2][0x4000] = {0}; + CompiledBlock* AddrMapping9[0x2000] = {0}; + CompiledBlock* AddrMapping7[0x4000] = {0}; CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM @@ -75,35 +78,63 @@ struct BlockCache extern BlockCache cache; -inline bool IsMapped(u32 num, u32 addr) +template +inline bool IsMapped(u32 addr) { - return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; + if (num == 0) + return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + else + return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; } -inline CompiledBlock LookUpBlock(u32 num, u32 addr) +template +inline CompiledBlock LookUpBlock(u32 addr) { - return cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + if (num == 0) + return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + else + return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; } -inline void Invalidate16(u32 num, u32 addr) +template +inline void Invalidate16(u32 addr) { - if (IsMapped(num, addr)) - cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; + if 
(IsMapped(addr)) + { + if (num == 0) + cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; + else + cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; + } } -inline void Invalidate32(u32 num, u32 addr) +template +inline void Invalidate32(u32 addr) { - if (IsMapped(num, addr)) + if (IsMapped(addr)) { - CompiledBlock* page = cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; + if (num == 0) + { + CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + page[(addr & 0x7FFF) >> 1] = NULL; + page[((addr + 2) & 0x7FFF) >> 1] = NULL; + } + else + { + CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + page[(addr & 0x3FFF) >> 1] = NULL; + page[((addr + 2) & 0x3FFF) >> 1] = NULL; + } } } -inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) +template +inline void InsertBlock(u32 addr, CompiledBlock func) { - cache.AddrMapping[num][(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + if (num == 0) + cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; + else + cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; } void Init(); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 1e871fd..cb11f73 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -363,7 +363,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - if (!IsMapped(Num, R15 - Thumb ? 2 : 4)) + if (!(Num == 0 + ? IsMapped<0>(R15 - (Thumb ? 2 : 4)) + : IsMapped<1>(R15 - (Thumb ? 
2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } diff --git a/src/NDS.cpp b/src/NDS.cpp index 62a52aa..cab78b5 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1864,7 +1864,7 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(0, addr); + ARMJIT::Invalidate16<0>(addr); #endif switch (addr & 0xFF000000) @@ -1918,7 +1918,7 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(0, addr); + ARMJIT::Invalidate16<0>(addr); #endif switch (addr & 0xFF000000) @@ -1988,7 +1988,7 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32(0, addr); + ARMJIT::Invalidate32<0>(addr); #endif switch (addr & 0xFF000000) @@ -2285,7 +2285,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(1, addr); + ARMJIT::Invalidate16<1>(addr); #endif switch (addr & 0xFF800000) @@ -2348,7 +2348,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16(1, addr); + ARMJIT::Invalidate16<1>(addr); #endif switch (addr & 0xFF800000) @@ -2421,7 +2421,7 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32(1, addr); + ARMJIT::Invalidate32<1>(addr); #endif switch (addr & 0xFF800000) -- cgit v1.2.3 From 4a0f6b3b4bd60815d0c8259e4ec2a944bfb716be Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 17:28:16 +0200 Subject: jit: fix thumb hi reg alu and mcr halt + mcr/mrc aren't always, msr_imm is never unk on ARM7 --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 21 ++++++++++++++------- src/ARM_InstrInfo.cpp | 33 ++++++++++++++++++++++++++++----- src/ARM_InstrInfo.h | 1 + 5 files changed, 45 insertions(+), 16 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp 
b/src/ARMJIT.cpp index aad14c0..6948eee 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -174,7 +174,7 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 013f54c..bdf06f7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); @@ -671,8 +671,6 @@ void Compiler::T_Comp_ALU_HiReg() case 0x2: // MOV if (rdMapped != rs) MOV(32, rdMapped, rs); - TEST(32, rdMapped, rdMapped); - Comp_RetriveFlags(false, false, false); break; } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 6ae4aad..9d4c1e2 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -235,16 +235,23 @@ void Compiler::T_Comp_B() void Compiler::T_Comp_BranchXchangeReg() { bool link = CurInstr.Instr & (1 << 7); - if (link && Num == 1) - { - printf("BLX unsupported on ARM7!!!\n"); - return; - } - OpArg rn = MapReg(CurInstr.A_Reg(3)); if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(32, R(RSCRATCH), MapReg(CurInstr.A_Reg(3))); MOV(32, MapReg(14), Imm32(R15 - 1)); - Comp_JumpTo(rn.GetSimpleReg()); + Comp_JumpTo(RSCRATCH); + } + else + { + OpArg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn.GetSimpleReg()); + } } void Compiler::T_Comp_BL_LONG_1() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 
5db2471..b70c8dc 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -152,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); +const u32 A_MSR_IMM = ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -310,6 +310,7 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= 1 << 15; res.Kind = (data >> 16) & 0x3F; + res.EndBlock = res.Branches(); return res; } @@ -324,6 +325,26 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_MCR) + { + u32 cn = (instr >> 16) & 0xF; + u32 cm = instr & 0xF; + u32 cpinfo = (instr >> 5) & 0x7; + u32 id = (cn<<8)|(cm<<4)|cpinfo; + if (id == 0x704 || id == 0x782) + res.EndBlock |= true; + } + if (res.Kind == ak_MCR || res.Kind == ak_MRC) + { + u32 cp = ((instr >> 8) & 0xF); + if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) + { + printf("happens\n"); + data = A_UNK; + res.Kind = ak_UNK; + } + } + if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); if (data & A_Read16) @@ -361,6 +382,8 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + res.EndBlock |= res.Branches(); + return res; } } diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 51dcfa2..4fe9b10 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -220,6 +220,7 @@ struct Info u16 
DstRegs, SrcRegs; u16 Kind; + bool EndBlock; bool Branches() { return DstRegs & (1 << 15); -- cgit v1.2.3 From f31976fed0c0c61e403ccaee5154c1f25d24d60d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 22 Jul 2019 01:04:42 +0200 Subject: jit: fix RSC --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index bdf06f7..368fd8b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -181,7 +181,7 @@ void Compiler::A_Comp_Arith() Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); -- cgit v1.2.3 From 5e443e79625b66daf15350d68921d74673cb5232 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 16 Aug 2019 23:17:08 +0200 Subject: remove unneeded dolphin code, C++11 static_assert --- src/ARMJIT.cpp | 2 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 19 ++++---- src/ARMJIT_x64/ARMJIT_Compiler.h | 5 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 2 +- src/dolphin/Assert.h | 47 ------------------- src/dolphin/CodeBlock.h | 91 ------------------------------------- src/dolphin/Compat.h | 63 +++++++++++++++++++++++++ src/dolphin/Intrinsics.h | 72 ----------------------------- src/dolphin/Log.h | 21 --------- src/dolphin/x64CPUDetect.cpp | 1 - src/dolphin/x64Emitter.cpp | 3 +- src/dolphin/x64Emitter.h | 13 +----- 13 files changed, 84 insertions(+), 259 deletions(-) delete mode 100644 src/dolphin/Assert.h delete mode 100644 src/dolphin/CodeBlock.h create mode 100644 
src/dolphin/Compat.h delete mode 100644 src/dolphin/Intrinsics.h delete mode 100644 src/dolphin/Log.h (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 6948eee..74554d7 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -188,6 +188,8 @@ CompiledBlock CompileBlock(ARM* cpu) void InvalidateBlockCache() { + printf("Resetting JIT block cache...\n"); + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 368fd8b..f0bcf8e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -257,7 +257,7 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O Comp_AddCycles_CI(RSCRATCH, add ? 2 : 1); } - static_assert(EAX == RSCRATCH); + static_assert(EAX == RSCRATCH, "Someone changed RSCRATCH!"); MOV(32, R(RSCRATCH), rm); if (add) { @@ -383,7 +383,7 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b } MOV(32, R(RSCRATCH), rm); - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "Someone changed RSCRATCH3"); MOV(32, R(ECX), rs); AND(32, R(ECX), Imm32(0xFF)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index cb11f73..0fbcfda 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -63,12 +63,11 @@ Compiler::Compiler() mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); #endif - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + ResetStart = pageAligned; + CodeMemSize = alignedSize; } - ClearCodeSpace(); + Reset(); for (int i = 0; i < 3; i++) { @@ -169,9 +168,8 @@ Compiler::Compiler() } // move the region forward to prevent overwriting the generated functions - region_size -= GetWritableCodePtr() - region; - total_region_size = region_size; - region = 
GetWritableCodePtr(); + CodeMemSize -= GetWritableCodePtr() - ResetStart; + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -208,7 +206,7 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) { if (cond >= 0x8) { - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); MOV(32, R(RSCRATCH3), R(RCPSR)); SHR(32, R(RSCRATCH3), Imm8(28)); MOV(32, R(RSCRATCH), Imm32(1)); @@ -346,12 +344,13 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - ClearCodeSpace(); + memset(ResetStart, 0xcc, CodeMemSize); + SetCodePtr(ResetStart); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { - if (IsAlmostFull()) + if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... InvalidateBlockCache(); ConstantCycles = 0; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 0ce7d8d..3151cbc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -17,7 +17,7 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler : public Gen::X64CodeBlock +class Compiler : public Gen::XEmitter { public: Compiler(); @@ -132,6 +132,9 @@ public: return Gen::R(RegCache.Mapping[reg]); } + u8* ResetStart; + u32 CodeMemSize; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index ee0a7af..6386f8b 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -171,7 +171,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } RET(); - static_assert(RSCRATCH == EAX); + static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); return res; } diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h deleted file mode 100644 index 4eb16e0..0000000 --- a/src/dolphin/Assert.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 
2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include - -#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ - assert(_a_) \ - /*do \ - { \ - if (!(_a_)) \ - { \ - if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ - assert(_a_); \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ - { \ - ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ - if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - ASSERT_MSG(MASTER_LOG, _a_, \ - _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ - __LINE__, __FILE__); \ - } while (0)*/ - -#define DEBUG_ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ - ASSERT(_a_); \ - } while (0)*/ diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h deleted file mode 100644 index e71cf6d..0000000 --- a/src/dolphin/CodeBlock.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2014 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include -#include - -#include "Assert.h" -#include "../types.h" - -namespace Common -{ -// Everything that needs to generate code should inherit from this. -// You get memory management for free, plus, you can use all emitter functions without -// having to prefix them with gen-> or something similar. -// Example implementation: -// class JIT : public CodeBlock {} -template -class CodeBlock : public T -{ -private: - // A privately used function to set the executable RAM space to something invalid. - // For debugging usefulness it should be used to set the RAM to a host specific breakpoint - // instruction - virtual void PoisonMemory() = 0; - -protected: - u8* region = nullptr; - // Size of region we can use. 
- size_t region_size = 0; - // Original size of the region we allocated. - size_t total_region_size = 0; - - bool m_is_child = false; - std::vector m_children; - -public: - CodeBlock() = default; - virtual ~CodeBlock() - { - } - CodeBlock(const CodeBlock&) = delete; - CodeBlock& operator=(const CodeBlock&) = delete; - CodeBlock(CodeBlock&&) = delete; - CodeBlock& operator=(CodeBlock&&) = delete; - - // Always clear code space with breakpoints, so that if someone accidentally executes - // uninitialized, it just breaks into the debugger. - void ClearCodeSpace() - { - PoisonMemory(); - ResetCodePtr(); - } - - bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } - - void ResetCodePtr() { T::SetCodePtr(region); } - size_t GetSpaceLeft() const - { - ASSERT(static_cast(T::GetCodePtr() - region) < region_size); - return region_size - (T::GetCodePtr() - region); - } - - bool IsAlmostFull() const - { - // This should be bigger than the biggest block ever. - return GetSpaceLeft() < 0x10000; - } - - bool HasChildren() const { return region_size != total_region_size; } - u8* AllocChildCodeSpace(size_t child_size) - { - ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); - u8* child_region = region + region_size - child_size; - region_size -= child_size; - return child_region; - } - void AddChildCodeSpace(CodeBlock* child, size_t child_size) - { - u8* child_region = AllocChildCodeSpace(child_size); - child->m_is_child = true; - child->region = child_region; - child->region_size = child_size; - child->total_region_size = child_size; - child->ResetCodePtr(); - m_children.emplace_back(child); - } -}; -} // namespace Common diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h new file mode 100644 index 0000000..f2f52a5 --- /dev/null +++ b/src/dolphin/Compat.h @@ -0,0 +1,63 @@ +// Stubs for Assert.h and Log.h +#pragma once + +#include + +// Assert stub +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) 
\ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ + +// Log Stub +#include + +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define DYNA_REC 0 + +#define ERROR_LOG(which, fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + } while (false) diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h deleted file mode 100644 index 483f219..0000000 --- a/src/dolphin/Intrinsics.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#if defined(_M_X86) - -/** - * It is assumed that all compilers used to build Dolphin support intrinsics up to and including - * SSE 4.2 on x86/x64. - */ - -#if defined(__GNUC__) || defined(__clang__) - -/** - * Due to limitations in GCC, SSE intrinsics are only available when compiling with the - * corresponding instruction set enabled. However, using the target attribute, we can compile - * single functions with a different target instruction set, while still creating a generic build. 
- * - * Since this instruction set is enabled per-function, any callers should verify that the - * instruction set is supported at runtime before calling it, and provide a fallback implementation - * when not supported. - * - * When building with -march=native, or enabling the instruction sets in the compile flags, permit - * usage of the instrinsics without any function attributes. If the command-line architecture does - * not support this instruction set, enable it via function targeting. - */ - -#include -#ifndef __SSE4_2__ -#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]] -#endif -#ifndef __SSE4_1__ -#define FUNCTION_TARGET_SSR41 [[gnu::target("sse4.1")]] -#endif -#ifndef __SSSE3__ -#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]] -#endif -#ifndef __SSE3__ -#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]] -#endif - -#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) - -/** - * MSVC and ICC support intrinsics for any instruction set without any function attributes. - */ -#include - -#endif // defined(_MSC_VER) || defined(__INTEL_COMPILER) - -#endif // _M_X86 - -/** - * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform. - * This way when a function is defined with FUNCTION_TARGET you don't need to define a second - * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use - * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures. 
- */ -#ifndef FUNCTION_TARGET_SSE42 -#define FUNCTION_TARGET_SSE42 -#endif -#ifndef FUNCTION_TARGET_SSR41 -#define FUNCTION_TARGET_SSR41 -#endif -#ifndef FUNCTION_TARGET_SSSE3 -#define FUNCTION_TARGET_SSSE3 -#endif -#ifndef FUNCTION_TARGET_SSE3 -#define FUNCTION_TARGET_SSE3 -#endif diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h deleted file mode 100644 index a7f4b6a..0000000 --- a/src/dolphin/Log.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include "CommonFuncs.h" - -#include - -#define PanicAlert(fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - - -#define DYNA_REC 0 - -#define ERROR_LOG(which, fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - } while (false) diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp index 05ee11c..49b51c9 100644 --- a/src/dolphin/x64CPUDetect.cpp +++ b/src/dolphin/x64CPUDetect.cpp @@ -7,7 +7,6 @@ #include "CPUDetect.h" #include "../types.h" -#include "Intrinsics.h" #ifndef _MSVC_VER diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp index 7849624..343f314 100644 --- a/src/dolphin/x64Emitter.cpp +++ b/src/dolphin/x64Emitter.cpp @@ -7,9 +7,10 @@ #include "CPUDetect.h" #include "../types.h" -#include "Log.h" #include "x64Emitter.h" #include "x64Reg.h" +#include "Compat.h" +#include "CommonFuncs.h" namespace Gen { diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h index 122850d..869acb6 100644 --- a/src/dolphin/x64Emitter.h +++ b/src/dolphin/x64Emitter.h @@ -12,9 +12,8 @@ #include #include -#include "Assert.h" +#include "Compat.h" #include "BitSet.h" -#include "CodeBlock.h" #include "../types.h" #include "x64ABI.h" @@ -1167,14 +1166,4 @@ public: } }; // class XEmitter -class X64CodeBlock : public Common::CodeBlock -{ -private: - void PoisonMemory() override - { - // x86/64: 0xCC = breakpoint - memset(region, 0xCC, region_size); - } -}; - } // namespace -- cgit v1.2.3 From 3001d9492c6e7e83e82843a4b9c6186b0b58f5e5 Mon 
Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 17 Aug 2019 16:50:48 +0200 Subject: abandon pipelining on jit fixes Golden Sun Dawn this makes the cpu state incompatible between interpreter and JIT. That's why switching cpu mode requires a restart(not requiring is stupid anyway) and the pipeline is manually filled when making a save state. --- src/ARM.cpp | 46 +++++++++++++++++++++++++++++- src/ARM.h | 6 ++++ src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_Branch.cpp | 39 +++++++++++-------------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 5 ---- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 5 ---- src/libui_sdl/DlgEmuSettings.cpp | 28 ++++++++++++++---- src/libui_sdl/libui/ui.h | 1 + src/libui_sdl/libui/windows/stddialogs.cpp | 17 +++++++++-- src/libui_sdl/main.cpp | 16 +++++------ 10 files changed, 116 insertions(+), 48 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index 868c287..e404943 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -22,6 +22,7 @@ #include "ARMInterpreter.h" #include "AREngine.h" #include "ARMJIT.h" +#include "Config.h" // instruction timing notes @@ -122,6 +123,13 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); file->Var32(&CurInstr); + if (!file->Saving && Config::JIT_Enable) + { + // hack, the JIT doesn't really pipeline + // but we still want JIT save states to be + // loaded while running the interpreter + FillPipeline(); + } file->VarArray(NextInstr, 2*sizeof(u32)); file->Var32(&ExceptionBase); @@ -724,4 +732,40 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; } -#endif \ No newline at end of file +#endif + +void ARMv5::FillPipeline() +{ + if (CPSR & 0x20) + { + if ((R[15] - 2) & 0x2) + { + NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; + NextInstr[1] = CodeRead32(R[15], false); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[1] = NextInstr[0] >> 16; + } + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4, false); + 
NextInstr[1] = CodeRead32(R[15], false); + } +} + +void ARMv4::FillPipeline() +{ + if (CPSR & 0x20) + { + NextInstr[0] = CodeRead16(R[15] - 2); + NextInstr[1] = CodeRead16(R[15]); + } + else + { + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + } +} \ No newline at end of file diff --git a/src/ARM.h b/src/ARM.h index ecdf5b4..4d387bc 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -42,6 +42,8 @@ public: virtual void DoSavestate(Savestate* file); + virtual void FillPipeline() = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; void RestoreCPSR(); @@ -148,6 +150,8 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void PrefetchAbort(); @@ -272,6 +276,8 @@ class ARMv4 : public ARM public: ARMv4(); + void FillPipeline(); + void JumpTo(u32 addr, bool restorecpsr = false); void Execute(); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74554d7..949bc1c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -139,6 +139,7 @@ CompiledBlock CompileBlock(ARM* cpu) int i = 0; u32 blockAddr = cpu->R[15] - (thumb ? 
2 : 4); u32 r15 = cpu->R[15]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; do { diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 9d4c1e2..30b18d7 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -4,6 +4,14 @@ using namespace Gen; namespace ARMJIT { + +template +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { @@ -12,9 +20,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // we'll see how it works out u32 newPC; - u32 nextInstr[2]; u32 cycles = 0; - bool setupRegion = false; if (addr & 0x1 && !Thumb) { @@ -40,7 +46,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - setupRegion = newregion != oldregion; + bool setupRegion = newregion != oldregion; if (setupRegion) cpu9->SetupCodeMem(addr); @@ -53,15 +59,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { - nextInstr[0] = cpu9->CodeRead32(addr-2, true) >> 16; + cpu9->CodeRead32(addr-2, true); cycles += cpu9->CodeCycles; - nextInstr[1] = cpu9->CodeRead32(addr+2, false); + cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } else { - nextInstr[0] = cpu9->CodeRead32(addr, true); - nextInstr[1] = nextInstr[0] >> 16; + cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; } } @@ -70,12 +75,15 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) addr &= ~0x3; newPC = addr+4; - nextInstr[0] = cpu9->CodeRead32(addr, true); + cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; - nextInstr[1] = cpu9->CodeRead32(addr+4, false); + cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } + MOV(64, MDisp(RCPU, 
offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); + cpu9->RegionCodeCycles = compileTimeCodeCycles; if (setupRegion) cpu9->SetupCodeMem(R15); @@ -102,8 +110,6 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 compileTimePC = CurCPU->R[15]; CurCPU->R[15] = newPC; - nextInstr[0] = ((ARMv4*)CurCPU)->CodeRead16(addr); - nextInstr[1] = ((ARMv4*)CurCPU)->CodeRead16(addr+2); cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1]; CurCPU->R[15] = compileTimePC; @@ -116,8 +122,6 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) u32 compileTimePC = CurCPU->R[15]; CurCPU->R[15] = newPC; - nextInstr[0] = cpu7->CodeRead32(addr); - nextInstr[1] = cpu7->CodeRead32(addr+4); cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3]; CurCPU->R[15] = compileTimePC; @@ -128,19 +132,10 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) } MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(nextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(nextInstr[1])); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); - - if (setupRegion) - { - MOV(64, R(ABI_PARAM1), R(RCPU)); - MOV(32, R(ABI_PARAM2), Imm32(newPC)); - CALL((void*)&ARMv5::SetupCodeMem); - } } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 0fbcfda..ab13cb6 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -395,11 +395,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); MOV(32, MDisp(RCPU, offsetof(ARM, 
CodeCycles)), Imm32(CurInstr.CodeCycles)); MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (i == instrsCount - 1) - { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); - } if (comp == NULL) SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 6386f8b..3b4cb7d 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -457,11 +457,6 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -void printStuff2(u32 a, u32 b) -{ - printf("b %x %x\n", a, b); -} - s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { int regsCount = regs.Count(); diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 46f5f9f..2f5ee2d 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -29,6 +29,7 @@ void ApplyNewSettings(int type); +extern bool RunningSomething; namespace DlgEmuSettings { @@ -57,10 +58,10 @@ void OnCancel(uiButton* btn, void* blarg) void OnOk(uiButton* btn, void* blarg) { - Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); - #ifdef JIT_ENABLED - Config::JIT_Enable = uiCheckboxChecked(cbJITEnabled); + bool restart = false; + + bool enableJit = uiCheckboxChecked(cbJITEnabled); char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); uiFreeText(maxBlockSizeStr); @@ -68,15 +69,32 @@ void OnOk(uiButton* btn, void* blarg) blockSize = 1; if (blockSize > 32) blockSize = 32; - Config::JIT_MaxBlockSize = blockSize; + + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize) + { + if (RunningSomething && + !uiMsgBoxConfirm(win, "Reset emulator", + "Changing JIT settings requires a reset.\n\nDo you want to continue?")) + return; 
+ + Config::JIT_Enable = enableJit; + Config::JIT_MaxBlockSize = Config::JIT_MaxBlockSize; + + restart = true; + } #endif + Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); + Config::Save(); uiControlDestroy(uiControl(win)); opened = false; - ApplyNewSettings(4); +#ifdef JIT_ENABLED + if (restart) + ApplyNewSettings(4); +#endif } #ifdef JIT_ENABLED diff --git a/src/libui_sdl/libui/ui.h b/src/libui_sdl/libui/ui.h index 03aef5d..e45fe91 100644 --- a/src/libui_sdl/libui/ui.h +++ b/src/libui_sdl/libui/ui.h @@ -289,6 +289,7 @@ _UI_EXTERN char *uiOpenFile(uiWindow *parent, const char* filter, const char* in _UI_EXTERN char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath); _UI_EXTERN void uiMsgBox(uiWindow *parent, const char *title, const char *description); _UI_EXTERN void uiMsgBoxError(uiWindow *parent, const char *title, const char *description); +_UI_EXTERN int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description); typedef struct uiArea uiArea; typedef struct uiAreaHandler uiAreaHandler; diff --git a/src/libui_sdl/libui/windows/stddialogs.cpp b/src/libui_sdl/libui/windows/stddialogs.cpp index d0fd506..7537015 100644 --- a/src/libui_sdl/libui/windows/stddialogs.cpp +++ b/src/libui_sdl/libui/windows/stddialogs.cpp @@ -136,7 +136,7 @@ char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) // TODO switch to TaskDialogIndirect()? 
-static void msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) +static int msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) { WCHAR *wtitle, *wdescription; HRESULT hr; @@ -144,12 +144,15 @@ static void msgbox(HWND parent, const char *title, const char *description, TASK wtitle = toUTF16(title); wdescription = toUTF16(description); - hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, NULL); + int result; + hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, &result); if (hr != S_OK) logHRESULT(L"error showing task dialog", hr); uiFree(wdescription); uiFree(wtitle); + + return result; } void uiMsgBox(uiWindow *parent, const char *title, const char *description) @@ -165,3 +168,13 @@ void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, TD_ERROR_ICON); enableAllWindowsExcept(parent); } + +int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) +{ + disableAllWindowsExcept(parent); + int result = + msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON | TDCBF_CANCEL_BUTTON, TD_WARNING_ICON); + enableAllWindowsExcept(parent); + + return result == IDOK; +} \ No newline at end of file diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index af05d7a..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2355,6 +2355,14 @@ void OnSetShowOSD(uiMenuItem* item, uiWindow* window, void* blarg) void ApplyNewSettings(int type) { +#ifdef JIT_ENABLED + if (type == 4) + { + Reset(NULL); + return; + } +#endif + if (!RunningSomething) { if (type == 1) return; @@ -2409,14 +2417,6 @@ void ApplyNewSettings(int type) GPU3D::InitRenderer(Screen_UseGL); if (Screen_UseGL) uiGLMakeContextCurrent(NULL); } - else if (type == 4) - { -#ifdef JIT_ENABLED - if 
(Config::JIT_Enable) - ARMJIT::InvalidateBlockCache(); -#endif - } - EmuRunning = prevstatus; } -- cgit v1.2.3 From 5ea91b8a039e0735ac5cb102e2375c26c4f7a150 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 12:28:48 +0200 Subject: optimise away unneeded flag sets - especially useful for thumb code and larger max block sizes - can still be improved upon --- src/ARMJIT.cpp | 24 ++++ src/ARMJIT.h | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 64 +++++++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 9 ++ src/ARMJIT_x64/ARMJIT_Compiler.h | 6 +- src/ARM_InstrInfo.cpp | 238 +++++++++++++++++++++++-------------- src/ARM_InstrInfo.h | 13 ++ src/libui_sdl/main.cpp | 2 + 8 files changed, 248 insertions(+), 109 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 949bc1c..3b6bc2e 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -126,6 +126,24 @@ void DeInit() delete compiler; } +void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +{ + for (int j = start; j >= 0; j--) + { + u8 match = instrs[j].Info.WriteFlags & flags; + u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags; + if (matchMaybe) // writes flags maybe + instrs[j].SetFlags |= matchMaybe; + if (match) + { + instrs[j].SetFlags |= match; + flags &= ~match; + if (!flags) + return; + } + } +} + CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -175,8 +193,14 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; + + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) + floodFillSetFlags(instrs, i - 2, canCompile ? 
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + floodFillSetFlags(instrs, i - 1, 0xF); + CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); if (cpu->Num == 0) diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 0fc1c38..6197695 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,6 +28,7 @@ struct FetchedInstr return Instr >> 28; } + u8 SetFlags; u32 Instr; u32 NextInstr[2]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f0bcf8e..6a7d711 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -111,6 +111,8 @@ OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) } else { + S = S && (CurInstr.SetFlags & 0x2); + int op = (CurInstr.Instr >> 5) & 0x3; if (CurInstr.Instr & (1 << 4)) { @@ -215,7 +217,8 @@ void Compiler::A_Comp_MovOp() if (S) { - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -263,12 +266,14 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O { IMUL(32, RSCRATCH, rs); LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); - TEST(32, rd, rd); + if (S && FlagsNZRequired()) + TEST(32, rd, rd); } else { IMUL(32, RSCRATCH, rs); MOV(32, rd, R(RSCRATCH)); + if (S && FlagsNZRequired()) TEST(32, R(RSCRATCH), R(RSCRATCH)); } @@ -331,7 +336,7 @@ void Compiler::A_Comp_SMULL_SMLAL() else { IMUL(64, RSCRATCH2, R(RSCRATCH3)); - if (S) + if (S && FlagsNZRequired()) TEST(64, R(RSCRATCH2), R(RSCRATCH2)); } @@ -345,9 +350,20 @@ void Compiler::A_Comp_SMULL_SMLAL() void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { - CPSRDirty = true; + if (CurInstr.SetFlags == 0) + return; + if (retriveCV && !(CurInstr.SetFlags & 0x3)) + retriveCV = false; bool carryOnly = !retriveCV && carryUsed; + if (carryOnly && !(CurInstr.SetFlags & 0x2)) + { + carryUsed = false; + carryOnly = false; + } + + CPSRDirty = true; + if (retriveCV) 
{ SETcc(CC_O, R(RSCRATCH)); @@ -355,19 +371,28 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - SETcc(CC_S, R(RSCRATCH)); - SETcc(CC_Z, R(RSCRATCH3)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); - int shiftAmount = 30; - if (retriveCV || carryUsed) + if (FlagsNZRequired()) { - LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); - shiftAmount = carryOnly ? 29 : 28; - } - SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); - AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); - OR(32, R(RCPSR), R(RSCRATCH)); + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); + AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH2)); + } } // always uses RSCRATCH, RSCRATCH2 only if S == true @@ -523,7 +548,8 @@ void Compiler::T_Comp_ShiftImm() if (shifted != rd) MOV(32, rd, shifted); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -557,7 +583,8 @@ void Compiler::T_Comp_ALU_Imm8() { case 0x0: MOV(32, rd, imm); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, false); return; case 0x1: @@ -607,7 +634,8 @@ void Compiler::T_Comp_ALU() int shiftOp = op == 0x7 ? 
3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); - TEST(32, shifted, shifted); + if (FlagsNZRequired()) + TEST(32, shifted, shifted); MOV(32, rd, shifted); Comp_RetriveFlags(false, false, true); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ab13cb6..6abb2bb 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -342,6 +342,11 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); @@ -380,11 +385,15 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); + printf("block start %d\n", Thumb); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); + CompileFunc comp = Thumb ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3151cbc..8861884 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,6 +29,8 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); + bool CanCompile(bool thumb, u16 kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); @@ -64,7 +66,6 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); - void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -121,6 +122,9 @@ public: void LoadCPSR(); void SaveCPSR(); + bool FlagsNZRequired() + { return CurInstr.SetFlags & 0xC; } + Gen::FixupBranch CheckCondition(u32 cond); Gen::OpArg MapReg(int reg) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 4813799..ea6d827 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 13) +#define ak(x) ((x) << 18) enum { A_Read0 = 1 << 0, @@ -26,69 +26,81 @@ enum { A_Link = 1 << 10, A_UnkOnARM7 = 1 << 11, + + A_SetNZ = 1 << 12, + A_SetCV = 1 << 13, + A_SetMaybeC = 1 << 14, + A_MulFlags = 1 << 15, + A_ReadC = 1 << 16, + A_RRXReadC = 1 << 17, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_IMPLEMENT_ALU_OP(x,k) \ - const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Write12 | 
A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ +#define A_ARITH A_SetCV +#define A_LOGIC A_SetMaybeC +#define A_ARITH_IMM A_SetCV +#define A_LOGIC_IMM 0 + +#define A_IMPLEMENT_ALU_OP(x,k,a,c) \ + const u32 A_##x##_IMM = A_Write12 | c | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ - const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S 
= A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); - -A_IMPLEMENT_ALU_OP(AND,BIOP) -A_IMPLEMENT_ALU_OP(EOR,BIOP) -A_IMPLEMENT_ALU_OP(SUB,BIOP) -A_IMPLEMENT_ALU_OP(RSB,BIOP) -A_IMPLEMENT_ALU_OP(ADD,BIOP) -A_IMPLEMENT_ALU_OP(ADC,BIOP) -A_IMPLEMENT_ALU_OP(SBC,BIOP) -A_IMPLEMENT_ALU_OP(RSC,BIOP) -A_IMPLEMENT_ALU_OP(ORR,BIOP) -A_IMPLEMENT_ALU_OP(MOV,MONOOP) -A_IMPLEMENT_ALU_OP(BIC,BIOP) -A_IMPLEMENT_ALU_OP(MVN,MONOOP) + const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(SUB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(RSB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADD,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(SBC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(RSC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(ORR,BIOP,LOGIC,0) 
+A_IMPLEMENT_ALU_OP(MOV,MONOOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(BIC,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; -#define A_IMPLEMENT_ALU_TEST(x) \ - const u32 A_##x##_IMM = A_Read16 | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); - -A_IMPLEMENT_ALU_TEST(TST) -A_IMPLEMENT_ALU_TEST(TEQ) -A_IMPLEMENT_ALU_TEST(CMP) -A_IMPLEMENT_ALU_TEST(CMN) - -const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); -const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); -const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); -const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); -const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); -const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +#define A_IMPLEMENT_ALU_TEST(x,a) \ + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | 
ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST,LOGIC) +A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) +A_IMPLEMENT_ALU_TEST(CMP,ARITH) +A_IMPLEMENT_ALU_TEST(CMN,ARITH) + +const u32 A_MUL = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); @@ -161,7 +173,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 16) +#define tk(x) ((x) << 20) enum { T_Read0 = 1 << 0, @@ -183,42 +195,47 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15 + T_PopPC = 1 << 15, + + T_SetNZ = 1 << 16, + T_SetCV = 1 << 17, + T_SetMaybeC = 1 << 18, + T_ReadC = 1 << 19 }; -const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_Write0 | T_Read3 | 
tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM); - -const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); -const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); -const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_); -const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_); - -const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = T_Write8 | tk(tk_CMP_IMM); -const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM); -const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM); - -const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); -const u32 T_EOR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); -const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); -const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); -const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); -const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); -const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); -const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); -const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG); -const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG); -const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG); -const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG); -const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); -const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); -const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); -const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG); +const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 
T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_SetNZ | T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_SetNZ | T_Write0 | T_Read3 | tk(tk_MVN_REG); const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); -const u32 T_CMP_HIREG = 
T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_CMP_HIREG = T_SetNZ | T_SetCV | T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); @@ -268,10 +285,20 @@ const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); Info Decode(bool thumb, u32 num, u32 instr) { + const u8 FlagsReadPerCond[7] = { + flag_Z, + flag_C, + flag_N, + flag_V, + flag_C | flag_Z, + flag_N | flag_V, + flag_Z | flag_N | flag_V}; + Info res = {0}; if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + res.Kind = (data >> 20) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -309,7 +336,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_PopPC && instr & (1 << 8)) res.DstRegs |= 1 << 15; - res.Kind = (data >> 16) & 0x3F; + if (data & T_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & T_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & T_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if (data & T_ReadC) + res.ReadFlags |= flag_C; + + if (res.Kind == tk_BCOND) + res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; + res.EndBlock = res.Branches(); return res; @@ -323,7 +361,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 13) & 0x1FF; + res.Kind = (data >> 18) & 0x1FF; if (res.Kind == ak_MCR) { @@ -382,6 +420,26 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (data & A_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & A_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if ((data & A_MulFlags) && (instr & (1 << 20))) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_ReadC) + res.ReadFlags |= flag_C; + if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) + res.ReadFlags |= flag_C; + + if ((instr >> 28) < 0xE) + { + // make non 
conditional flag sets conditional + res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.ReadFlags |= FlagsReadPerCond[instr >> 29]; + } + res.EndBlock |= res.Branches(); return res; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 4fe9b10..5336837 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -215,11 +215,24 @@ enum tk_Count }; +enum +{ + flag_N = 1 << 3, + flag_Z = 1 << 2, + flag_C = 1 << 1, + flag_V = 1 << 0, +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 ReadFlags; + // lower 4 bits - set always + // upper 4 bits - might set flag + u8 WriteFlags; + bool EndBlock; bool Branches() { diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 0066668..c3db88d 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,6 +2675,8 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { + freopen("miauz.txt", "w", stdout); + srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From ea562d2fec9f4ab73e9ff3f519ff5ecb65736cd7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:06:27 +0200 Subject: fixes for flag optimisation --- src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- src/ARM_InstrInfo.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 3b6bc2e..5d92e47 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -163,6 +163,7 @@ CompiledBlock CompileBlock(ARM* cpu) { r15 += thumb ? 2 : 4; + instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6a7d711..f868ddf 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -387,7 +387,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 
3 : 0) << 28))); OR(32, R(RCPSR), R(RSCRATCH)); } - else + else if (carryUsed || retriveCV) { SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index ea6d827..3634c35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -436,7 +436,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional - res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.WriteFlags = (res.WriteFlags | (res.WriteFlags << 4)) & 0xF0; res.ReadFlags |= FlagsReadPerCond[instr >> 29]; } -- cgit v1.2.3 From 5202c505abe96e39814e9141e9487e3b549f28a4 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:09:03 +0200 Subject: remove debug printing --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ---- src/libui_sdl/main.cpp | 2 -- 2 files changed, 6 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6abb2bb..5e05446 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -385,15 +385,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); - printf("block start %d\n", Thumb); - for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; - printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); - CompileFunc comp = Thumb ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 2ef776883f286f938fe03700780544c56867e467 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:09:00 +0200 Subject: more fixes for flag optimisation + small cycle counting optimisation --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 4 ++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 28 ++++++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 4 ++ src/ARM_InstrInfo.cpp | 92 ++++++++++++++++++++++--------------- 5 files changed, 86 insertions(+), 44 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 30b18d7..c0a8f1f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -19,6 +19,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // it's not completely safe to assume stuff like, which instructions to preload // we'll see how it works out + IrregularCycles = true; + u32 newPC; u32 cycles = 0; @@ -140,6 +142,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { + IrregularCycles = true; + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5e05446..d585f39 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -447,6 +447,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Comp_AddCycles_C(); else { + IrregularCycles = 
false; + FixupBranch skipExecute; if (cond < 0xE) skipExecute = CheckCondition(cond); @@ -463,13 +465,19 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (CurInstr.Cond() < 0xE) { - FixupBranch skipFailed = J(); - SetJumpTarget(skipExecute); + if (IrregularCycles) + { + FixupBranch skipFailed = J(); + SetJumpTarget(skipExecute); - Comp_AddCycles_C(); + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); + } + else + SetJumpTarget(skipExecute); } + } } @@ -518,8 +526,16 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + if (!Thumb && CurInstr.Cond() < 0xE) + { + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + } + else + { + ConstantCycles += i + cycles; + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + } } } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 8861884..a62f043 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -139,6 +139,8 @@ public: u8* ResetStart; u32 CodeMemSize; + bool IrregularCycles; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 3b4cb7d..bf8280d 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -438,6 +438,8 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) { + IrregularCycles = true; + if (store) MOV(32, R(ABI_PARAM2), rd); u32 cycles = Num @@ -459,6 +461,8 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) s32 
Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { + IrregularCycles = true; + int regsCount = regs.Count(); if (decrement) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 3634c35..9239e29 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 18) +#define ak(x) ((x) << 21) enum { A_Read0 = 1 << 0, @@ -33,13 +33,21 @@ enum { A_MulFlags = 1 << 15, A_ReadC = 1 << 16, A_RRXReadC = 1 << 17, + A_StaticShiftSetC = 1 << 18, + A_SetC = 1 << 19, + + A_WriteMemory = 1 << 20, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_ARITH A_SetCV -#define A_LOGIC A_SetMaybeC +#define A_ARITH_LSL_IMM A_SetCV +#define A_LOGIC_LSL_IMM A_StaticShiftSetC +#define A_ARITH_SHIFT_IMM A_SetCV +#define A_LOGIC_SHIFT_IMM A_SetC +#define A_ARITH_SHIFT_REG A_SetCV +#define A_LOGIC_SHIFT_REG A_SetMaybeC #define A_ARITH_IMM A_SetCV #define A_LOGIC_IMM 0 @@ -55,14 +63,14 @@ enum { const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S = 
A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a##_LSL_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a##_SHIFT_IMM | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a##_SHIFT_REG | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) @@ -80,15 +88,15 @@ A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; #define A_IMPLEMENT_ALU_TEST(x,a) \ - const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | 
A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a##_IMM | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a##_LSL_IMM | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a##_SHIFT_IMM | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a##_SHIFT_REG | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); A_IMPLEMENT_ALU_TEST(TST,LOGIC) A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) @@ -115,20 +123,20 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 +#define A_STR A_Read12 | A_WriteMemory #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ const u32 A_##x##_REG_LSL = 
A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSL); \ const u32 A_##x##_REG_LSR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_LSR); \ const u32 A_##x##_REG_ASR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ASR); \ - const u32 A_##x##_REG_ROR = A_##k | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ + const u32 A_##x##_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_MemWriteback | A_Read0 | ak(ak_##x##_REG_ROR); \ \ const u32 A_##x##_POST_IMM = A_##k | A_Read16 | A_Write16 | ak(ak_##x##_POST_IMM); \ const u32 A_##x##_POST_REG_LSL = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSL); \ const u32 A_##x##_POST_REG_LSR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_LSR); \ const u32 A_##x##_POST_REG_ASR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ASR); \ - const u32 A_##x##_POST_REG_ROR = A_##k | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); + const u32 A_##x##_POST_REG_ROR = A_##k | A_RRXReadC | A_Read16 | A_Write16 | A_Read0 | ak(ak_##x##_POST_REG_ROR); A_IMPLEMENT_WB_LDRSTR(STR,STR) A_IMPLEMENT_WB_LDRSTR(STRB,STR) @@ -136,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double +#define A_STRD A_Read12Double | A_WriteMemory #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -151,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); 
+const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -173,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 20) +#define tk(x) ((x) << 21) enum { T_Read0 = 1 << 0, @@ -200,12 +208,13 @@ enum { T_SetNZ = 1 << 16, T_SetCV = 1 << 17, T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19 + T_ReadC = 1 << 19, + T_SetC = 1 << 20, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); @@ -213,7 +222,7 @@ const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Read8 | tk(tk_CMP_IMM); const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); @@ -240,7 +249,7 @@ const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); -const u32 T_ADD_SP = T_WriteR13 | tk(tk_ADD_SP); +const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); @@ -298,7 
+307,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 20) & 0x3F; + res.Kind = (data >> 21) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -344,12 +353,14 @@ Info Decode(bool thumb, u32 num, u32 instr) res.WriteFlags |= flag_C << 4; if (data & T_ReadC) res.ReadFlags |= flag_C; + if (data & T_SetC) + res.WriteFlags |= flag_C; + + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; - res.EndBlock = res.Branches(); - return res; } else @@ -361,7 +372,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 18) & 0x1FF; + res.Kind = (data >> 21) & 0x1FF; if (res.Kind == ak_MCR) { @@ -369,7 +380,7 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 cm = instr & 0xF; u32 cpinfo = (instr >> 5) & 0x7; u32 id = (cn<<8)|(cm<<4)|cpinfo; - if (id == 0x704 || id == 0x782) + if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) @@ -420,6 +431,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (res.Kind == ak_STM) + res.SrcRegs |= instr & (1 << 15); + if (data & A_SetNZ) res.WriteFlags |= flag_N | flag_Z; if (data & A_SetCV) @@ -432,6 +446,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.ReadFlags |= flag_C; if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) res.ReadFlags |= flag_C; + if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) + res.WriteFlags |= flag_C; if ((instr >> 28) < 0xE) { -- cgit v1.2.3 From 5338c28f408382263077b24bce5d5ab62bdf7024 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:48:20 +0200 Subject: load register only if needed - do thumb bl long merge in the first step - preparations for better branch jitting --- src/ARMJIT.cpp 
| 16 ++++++++++++++++ src/ARMJIT.h | 1 + src/ARMJIT_RegisterCache.h | 12 ++++++++---- src/ARMJIT_x64/ARMJIT_Branch.cpp | 12 +++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 34 ++++++++++++---------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARM_InstrInfo.h | 3 +++ 7 files changed, 48 insertions(+), 32 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 5d92e47..85cadf3 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -159,6 +159,7 @@ CompiledBlock CompileBlock(ARM* cpu) u32 r15 = cpu->R[15]; cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstrAddr[2] = {blockAddr, r15}; do { r15 += thumb ? 2 : 4; @@ -166,6 +167,10 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + + instrs[i].Addr = nextInstrAddr[0]; + nextInstrAddr[0] = nextInstrAddr[1]; + nextInstrAddr[1] = r15; if (cpu->Num == 0) { @@ -193,8 +198,19 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 + && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) + { + instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG; + instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16); + instrs[i - 1].Info.DstRegs = 0xC000; + instrs[i - 1].Info.SrcRegs = 0; + instrs[i - 1].Info.EndBlock = true; + i--; + } i++; + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) floodFillSetFlags(instrs, i - 2, canCompile ? 
instrs[i - 1].Info.ReadFlags : 0xF); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 6197695..7e448ef 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -31,6 +31,7 @@ struct FetchedInstr u8 SetFlags; u32 Instr; u32 NextInstr[2]; + u32 Addr; u8 CodeCycles; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 04c1eda..fe2f203 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -38,7 +38,7 @@ public: Mapping[reg] = (Reg)-1; } - void LoadRegister(int reg) + void LoadRegister(int reg, bool loadValue) { assert(Mapping[reg] == -1); for (int i = 0; i < NativeRegsAvailable; i++) @@ -50,7 +50,8 @@ public: NativeRegsUsed |= 1 << (int)nativeReg; LoadedRegs |= 1 << reg; - Compiler->LoadReg(reg, nativeReg); + if (loadValue) + Compiler->LoadReg(reg, nativeReg); return; } @@ -66,7 +67,7 @@ public: UnloadRegister(reg); } - void Prepare(int i) + void Prepare(bool thumb, int i) { u16 futureNeeded = 0; int ranking[16]; @@ -111,8 +112,11 @@ public: loadedSet.m_val = LoadedRegs; } + BitSet16 needValueLoaded(needToBeLoaded); + if (thumb || Instr.Cond() >= 0xE) + needValueLoaded = BitSet16(Instr.Info.SrcRegs); for (int reg : needToBeLoaded) - LoadRegister(reg); + LoadRegister(reg, needValueLoaded[reg]); } DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c0a8f1f..cc7a3c4 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -271,15 +271,17 @@ void Compiler::T_Comp_BL_LONG_2() Comp_JumpTo(RSCRATCH); } -void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +void Compiler::T_Comp_BL_Merged() { - assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); Comp_AddCycles_C(); - u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); - target += (CurInstr.Instr & 0x7FF) << 1; + R15 += 2; - if (Num == 1 || CurInstr.Instr & (1 << 12)) + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) 
<< 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) target |= 1; MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d585f39..d8ce1aa 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -338,7 +338,8 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { // Branch F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), // Unk, SVC - NULL, NULL + NULL, NULL, + F(T_Comp_BL_Merged) }; #undef F @@ -361,21 +362,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; - R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; CurCPU = cpu; CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); if (!(Num == 0 - ? IsMapped<0>(R15 - (Thumb ? 2 : 4)) - : IsMapped<1>(R15 - (Thumb ? 2 : 4)))) + ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) + : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } - bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -387,8 +385,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { - R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); CompileFunc comp = Thumb ? 
T_Comp[CurInstr.Info.Kind] @@ -406,29 +404,21 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } if (comp != NULL) - RegCache.Prepare(i); + RegCache.Prepare(Thumb, i); else RegCache.Flush(); if (Thumb) { - if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 - && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) - mergedThumbBL = true; - else + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) - { - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); - } - else if (mergedThumbBL) - T_Comp_BL_Merged(instrs[i - 1]); - else - (this->*comp)(); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } + else + (this->*comp)(); } else { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a62f043..fcb2380 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -90,7 +90,7 @@ public: void T_Comp_BranchXchangeReg(); void T_Comp_BL_LONG_1(); void T_Comp_BL_LONG_2(); - void T_Comp_BL_Merged(FetchedInstr prefix); + void T_Comp_BL_Merged(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 5336837..d01c600 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -212,6 +212,9 @@ enum tk_UNK, tk_SVC, + // not a real instruction + tk_BL_LONG, + tk_Count }; -- cgit v1.2.3 From a687be9879e5cab4ea5d8646c8cf47c214b18856 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... 
- more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- src/libui_sdl/DlgEmuSettings.cpp | 22 +- 19 files changed, 1550 insertions(+), 689 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index e404943..423c940 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -580,21 +580,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -710,23 +715,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! 
+ if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -736,6 +746,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -758,6 +770,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 4d387bc..8a01068 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -299,7 +299,7 @@ public: { *val = NDS::ARM7Read8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -308,7 +308,7 @@ public: *val = NDS::ARM7Read16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -317,7 +317,7 @@ public: *val = NDS::ARM7Read32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -325,14 +325,14 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -341,7 +341,7 @@ public: NDS::ARM7Write16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = 
NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -350,7 +350,7 @@ public: NDS::ARM7Write32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -358,7 +358,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include +#include #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) 
+ Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ 
DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, + DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, 
eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else 
if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), 
F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), 
F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? 
TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + 
isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if 
(__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 
&& addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - 
ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. - Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, - we only need every second half word to be adressable. +extern u32 AddrTranslate9[0x2000]; +extern u32 AddrTranslate7[0x4000]; - In case a memory write hits mapped memory, the function block at this - address is set to null, so it's recompiled the next time it's executed. - - This method has disadvantages, namely that only writing to the - first instruction of a block marks it as invalid and that memory remapping - (SWRAM and VRAM) isn't taken into account. 
-*/ - -struct BlockCache -{ - CompiledBlock* AddrMapping9[0x2000] = {0}; - CompiledBlock* AddrMapping7[0x4000] = {0}; - - CompiledBlock MainRAM[4*1024*1024/2]; - CompiledBlock SWRAM[0x8000/2]; // Shared working RAM - CompiledBlock ARM9_ITCM[0x8000/2]; - CompiledBlock ARM9_LCDC[0xA4000/2]; - CompiledBlock ARM9_BIOS[0x8000/2]; - CompiledBlock ARM7_BIOS[0x4000/2]; - CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM -}; - -extern BlockCache cache; +const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... +extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; } template -inline CompiledBlock LookUpBlock(u32 addr) +inline u32 TranslateAddr(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } template -inline void Invalidate16(u32 addr) +inline JitBlockEntry LookUpBlock(u32 addr) { - if (IsMapped(addr)) - { - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; - } -} - -template -inline void Invalidate32(u32 addr) -{ - if (IsMapped(addr)) - { - if (num == 0) - { - CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; - page[(addr & 0x7FFF) >> 1] = NULL; - page[((addr + 2) & 0x7FFF) >> 1] = NULL; - } - else - { - 
CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; - } - } -} - -template -inline void InsertBlock(u32 addr, CompiledBlock func) -{ - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + return FastBlockAccess[TranslateAddr(addr) / 2]; } void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateAll(); + +void InvalidateITCM(u32 addr); +void InvalidateByAddr7(u32 addr); + +void CompileBlock(ARM* cpu); -void InvalidateBlockCache(); +void ResetBlockCache(); } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h new file mode 100644 index 0000000..4acb488 --- /dev/null +++ b/src/ARMJIT_Internal.h @@ -0,0 +1,198 @@ +#ifndef ARMJIT_INTERNAL_H +#define ARMJIT_INTERNAL_H + +#include "types.h" +#include + +#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! 
+*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. 
Blocks; + u16 TimesInvalidated; +}; + +extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +typedef void (*InterpreterFunc)(ARM* cpu); +extern InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index fe2f203..ed6a2b7 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -60,15 +60,46 @@ public: assert("Welp!"); } + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + void Flush() { BitSet16 loadedSet(LoadedRegs); for (int reg : loadedSet) UnloadRegister(reg); + LiteralsLoaded = 0; } void Prepare(bool thumb, int i) { + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + u16 futureNeeded = 0; int ranking[16]; for (int j = 0; j < 16; j++) @@ -86,7 +117,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; + FetchedInstr Instr = Instrs[i]; u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -125,6 +156,9 @@ public: static const int NativeRegsAvailable; Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; u32 NativeRegsUsed = 0; u16 LoadedRegs = 0; u16 DirtyRegs = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f868ddf..14c223b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ 
b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp() MOV(32, rd, op2); if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); if (S) { @@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); - if (op & 1) + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); @@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU() u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) - Comp_AddCycles_CI(1); + Comp_AddCycles_CI(1); // shift by reg else Comp_AddCycles_C(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cc7a3c4..0dedb3f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -16,9 +16,6 @@ int squeezePointer(T* ptr) void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { // we can simplify constant branches by a lot - // it's not completely safe to assume stuff like, which instructions to preload - // we'll see how it works out - IrregularCycles = true; u32 newPC; @@ -39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = 
newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() 
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty = false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? 
CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? 
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch 
skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 
0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ 
public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 
100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! 
- MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - 
JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ -340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), 
R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); 
MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! 
%x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... 
+ if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! 
LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? 
ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 
1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) 
A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = 
T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = 
special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..10c3b1b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,9 +561,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -813,7 +815,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -837,7 +839,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -861,8 +863,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = 
NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -886,8 +887,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 3cff0ed..63d61a3 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,7 @@ int GL_Antialias; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -50,6 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c13eae3..0fcefc3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -49,6 +49,7 @@ extern int GL_Antialias; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 1baa308..e9e6795 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -536,7 +536,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -757,7 +757,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -1870,10 +1870,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1924,10 +1920,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch 
(addr & 0xFF000000) { case 0x02000000: @@ -1994,10 +1986,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2292,7 +2280,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2355,7 +2343,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2428,7 +2416,7 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 09ea8eb..45e8e0c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -42,6 +42,7 @@ uiCheckbox* cbDirectBoot; #ifdef JIT_ENABLED uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; +uiCheckbox* cbJITBranchOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -64,13 +65,15 @@ void OnOk(uiButton* btn, void* blarg) bool enableJit = uiCheckboxChecked(cbJITEnabled); char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); + bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || + branchOptimisations != Config::JIT_BrancheOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -79,6 +82,7 @@ void OnOk(uiButton* 
btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; + Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); restart = true; } @@ -101,9 +105,15 @@ void OnOk(uiButton* btn, void* blarg) void OnJITStateChanged(uiCheckbox* cb, void* blarg) { if (uiCheckboxChecked(cb)) + { uiControlEnable(uiControl(enJITMaxBlockSize)); + uiControlEnable(uiControl(cbJITBranchOptimisations)); + } else + { uiControlDisable(uiControl(enJITMaxBlockSize)); + uiControlDisable(uiControl(cbJITBranchOptimisations)); + } } #endif @@ -159,6 +169,14 @@ void Open() enJITMaxBlockSize = uiNewEntry(); uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); + } } #endif @@ -194,6 +212,8 @@ void Open() uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); } OnJITStateChanged(cbJITEnabled, NULL); + + uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); #endif uiControlShow(uiControl(win)); -- cgit v1.2.3 From 7424f9fda06bd15f0e00717b962a5ca8a00540b7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:14:33 +0200 Subject: remove leftover debug code --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 -- 1 file changed, 2 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 25c55a3..a994d34 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -598,8 +598,6 @@ void Compiler::Comp_AddCycles_CDI() cycles = numC + numD + 1; } - printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); - if (!Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else -- cgit v1.2.3 From 
aa23f21b8df9780578adf6e6ea6bcfba3fee83bb Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 16 Oct 2019 23:39:12 +0200 Subject: decrease jit block cache address granularity fixes Dragon Quest IX move code with side effects out of assert, fixes release build (thanks to m4wx for this one) also remove some leftovers of jit pipelining --- src/ARMJIT.cpp | 42 ++++++++++++++++++++++--------------- src/ARMJIT_Internal.h | 3 +-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 31 ++++++++++++++------------- src/ARM_InstrInfo.cpp | 25 ++++++++++++++-------- src/ARM_InstrInfo.h | 3 ++- src/libui_sdl/main.cpp | 2 ++ 6 files changed, 62 insertions(+), 44 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 686bdd6..19a5e70 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -106,7 +106,7 @@ u32 AddrTranslate9[0x2000]; u32 AddrTranslate7[0x4000]; JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; -AddressRange CodeRanges[ExeMemSpaceSize / 256]; +AddressRange CodeRanges[ExeMemSpaceSize / 512]; TinyVector JitBlocks; JitBlock* RestoreCandidates[0x1000] = {NULL}; @@ -285,6 +285,13 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = #undef F_MEM_HD #undef F +void T_BL_LONG(ARM* cpu) +{ + ARMInterpreter::T_BL_LONG_1(cpu); + cpu->R[15] += 2; + ARMInterpreter::T_BL_LONG_2(cpu); +} + #define F(x) ARMInterpreter::T_##x InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = { @@ -302,7 +309,7 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = F(PUSH), F(POP), F(LDMIA), F(STMIA), F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), F(UNK), F(SVC), - NULL // BL_LONG psudo opcode + T_BL_LONG // BL_LONG psudo opcode }; #undef F @@ -341,7 +348,7 @@ void CompileBlock(ARM* cpu) JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], cpu->Num == 0 ? 
LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), - CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -352,7 +359,7 @@ void CompileBlock(ARM* cpu) instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; - instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + nextInstr[0] = nextInstr[1]; instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; @@ -361,7 +368,7 @@ void CompileBlock(ARM* cpu) u32 translatedAddr = (cpu->Num == 0 ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) { bool returning = false; @@ -400,7 +407,6 @@ void CompileBlock(ARM* cpu) nextInstr[1] = cpuv4->CodeRead32(r15); instrs[i].CodeCycles = cpu->CodeCycles; } - instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); cpu->R[15] = r15; @@ -584,7 +590,7 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; @@ -595,7 +601,7 @@ void CompileBlock(ARM* cpu) void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); - AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + AddressRange* range = &CodeRanges[pseudoPhysical / 512]; int startLength = range->Blocks.Length; for (int i = 0; i < range->Blocks.Length; i++) { @@ -604,15 +610,17 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 256) != (pseudoPhysical / 256)) + if ((addr / 512) != (pseudoPhysical / 
512)) { - AddressRange* otherRange = &CodeRanges[addr / 256]; + AddressRange* otherRange = &CodeRanges[addr / 512]; assert(otherRange != range); - assert(otherRange->Blocks.RemoveByValue(block)); + bool removed = otherRange->Blocks.RemoveByValue(block); + assert(removed); } } - assert(JitBlocks.RemoveByValue(block)); + bool removed = JitBlocks.RemoveByValue(block); + assert(removed); FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; @@ -631,14 +639,14 @@ void InvalidateByAddr(u32 pseudoPhysical) void InvalidateByAddr7(u32 addr) { u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) InvalidateByAddr(pseudoPhysical); } void InvalidateITCM(u32 addr) { u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) InvalidateByAddr(pseudoPhysical); } @@ -654,7 +662,7 @@ void InvalidateAll() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeRanges[addr / 256]; + AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; @@ -689,8 +697,8 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 256].Blocks.Clear(); - CodeRanges[addr / 256].TimesInvalidated = 0; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].TimesInvalidated = 0; } delete block; } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4acb488..9e6713d 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -38,7 +38,6 @@ struct FetchedInstr u8 BranchFlags; u8 SetFlags; u32 Instr; - u32 NextInstr[2]; u32 Addr; u8 CodeCycles; @@ -185,7 +184,7 @@ struct 
__attribute__((packed)) AddressRange u16 TimesInvalidated; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; +extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 13ca415..eb01c87 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -105,7 +105,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) static_assert(sizeof(AddressRange) == 16); LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(8)); + SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); @@ -203,7 +203,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(8)); + SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); @@ -284,28 +284,29 @@ void fault(u32 a, u32 b) void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - if (flags & memop_Store) - { - Comp_AddCycles_CD(); - } - else - { - Comp_AddCycles_CDI(); - } - u32 addressMask = ~0; if (size == 32) addressMask = ~3; if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { - Comp_MemLoadLiteral(size, rd, - R15 + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1)); + u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + Comp_MemLoadLiteral(size, rd, addr); + return; } - else + { + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } + OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 0fbde26..1261bbe 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 21) +#define ak(x) ((x) << 22) enum { A_Read0 = 1 << 0, @@ -36,7 +36,8 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMem = 1 << 20 + A_WriteMem = 1 << 20, + A_LoadMem = 1 << 21 }; #define A_BIOP A_Read16 @@ -122,7 +123,7 @@ const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); -#define A_LDR A_Write12 +#define A_LDR A_Write12 | A_LoadMem #define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ @@ -143,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(STRB,STR) A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) -#define A_LDRD A_Write12Double +#define A_LDRD A_Write12Double | A_LoadMem #define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ @@ -159,10 +160,10 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_LoadMem | A_WriteMem | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_LDM = A_Read16 | A_MemWriteback | A_LoadMem | ak(ak_LDM); const u32 
A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); @@ -360,6 +361,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; + + if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + res.SpecialKind = special_LoadLiteral; res.EndBlock |= res.Branches(); @@ -377,7 +381,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 21) & 0x1FF; + res.Kind = (data >> 22) & 0x1FF; if (res.Kind == ak_MCR) { @@ -454,12 +458,15 @@ Info Decode(bool thumb, u32 num, u32 instr) res.ReadFlags |= flag_C; if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) res.ReadFlags |= flag_C; - if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) + if ((data & A_SetC) || ((data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F))) res.WriteFlags |= flag_C; if (data & A_WriteMem) res.SpecialKind = special_WriteMem; + if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d02f168..c032a4f 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -230,7 +230,8 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, - special_WaitForInterrupt + special_WaitForInterrupt, + special_LoadLiteral }; struct Info diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 0066668..c3db88d 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,6 +2675,8 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { + freopen("miauz.txt", "w", stdout); + srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 81f38c14be0d9ba5a3da8f67d9719ed2c47279c5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 13:29:17 +0200 Subject: integrate changes from ARM64 backend and more - better handle LDM/STM 
in reg alloc - unify Halted and IRQ in anticipation for branch inlining - literal optimisations can be disabled in gui - jit blocks follow simple returns - fix idle loop detection - break jit blocks on IRQ (fixes saving in Pokemon White) --- src/ARM.cpp | 40 ++++++++++++++++++----------- src/ARM.h | 13 +++++++--- src/ARMJIT.cpp | 50 +++++++++++++++++++++++++++++++------ src/ARMJIT_RegisterCache.h | 33 +++++++++++++++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 7 +++--- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++---- src/ARM_InstrInfo.cpp | 28 +++++++++++++++++++++ src/ARM_InstrInfo.h | 2 +- src/Config.cpp | 2 ++ src/Config.h | 1 + src/NDS.cpp | 2 +- src/libui_sdl/DlgEmuSettings.cpp | 31 ++++++++++++++++++++--- src/libui_sdl/main.cpp | 2 -- 13 files changed, 179 insertions(+), 48 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index 423c940..4fab60e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -113,7 +113,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -589,16 +589,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -726,16 +731,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! 
- if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index 8a01068..e252d23 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e70..0695b85 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) 
printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - 
bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need 
to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe..8f8bd35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4f..2732181 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; 
u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index 63d61a3..eb5bfcc 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -38,6 +38,7 @@ int GL_Antialias; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,6 +53,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 0fcefc3..723ab13 100644 --- a/src/Config.h +++ b/src/Config.h @@ -50,6 +50,7 @@ extern int GL_Antialias; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index e9e6795..141c565 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1142,7 +1142,7 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); } else { diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 45e8e0c..0df9c6c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -43,6 +43,7 @@ uiCheckbox* cbDirectBoot; uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; uiCheckbox* cbJITBranchOptimisations; +uiCheckbox* cbJITLiteralOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -66,14 +67,16 @@ void OnOk(uiButton* btn, void* blarg) char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) 
blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || - branchOptimisations != Config::JIT_BrancheOptimisations) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize + || branchOptimisations != Config::JIT_BrancheOptimisations + || literalOptimisations != Config::JIT_LiteralOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -82,7 +85,8 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + Config::JIT_BrancheOptimisations = branchOptimisations; + Config::JIT_LiteralOptimisations = literalOptimisations; restart = true; } @@ -108,11 +112,13 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) { uiControlEnable(uiControl(enJITMaxBlockSize)); uiControlEnable(uiControl(cbJITBranchOptimisations)); + uiControlEnable(uiControl(cbJITLiteralOptimisations)); } else { uiControlDisable(uiControl(enJITMaxBlockSize)); uiControlDisable(uiControl(cbJITBranchOptimisations)); + uiControlDisable(uiControl(cbJITLiteralOptimisations)); } } #endif @@ -174,9 +180,25 @@ void Open() uiBox* row = uiNewHorizontalBox(); uiBoxAppend(in_ctrl, uiControl(row), 0); - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); + uiBoxAppend(row, uiControl(lbl), 0); + } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); + uiBoxAppend(row, 
uiControl(cbJITLiteralOptimisations), 0); + } } #endif @@ -214,6 +236,7 @@ void Open() OnJITStateChanged(cbJITEnabled, NULL); uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); + uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); #endif uiControlShow(uiControl(win)); diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 386100c053adad10ab7de066d37f383d58d5cfa1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 3 Nov 2019 15:33:20 +0100 Subject: make literal optimisation more reliable fixes spanish Pokemon HeartGold --- src/ARMJIT.cpp | 52 +++++++++++++++++++++++++++++++++---- src/ARMJIT.h | 2 +- src/ARMJIT_Internal.h | 3 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 34 +++++++++++++++++++----- 4 files changed, 77 insertions(+), 14 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 0695b85..c7387c9 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -161,6 +161,27 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } +bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +{ + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_STR_IMM: + case ARMInstrInfo::ak_STRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_STRD_IMM: + case ARMInstrInfo::ak_STRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 
1 : -1)); + return true; + case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} + addr = instr.Addr + 8; + return true; + default: + JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); + return false; + } +} + bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, u32& linkAddr, u32& targetAddr) { @@ -463,6 +484,23 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem + && instrs[i].Info.SrcRegs == (1 << 15) + && instrs[i].Info.DstRegs == 0) + { + assert (!thumb); + + u32 addr; + if (DecodeLiteral(instrs[i], addr)) + { + JIT_DEBUGPRINT("pc relative write detected\n"); + u32 translatedAddr = cpu->Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + ARMJIT::InvalidateByAddr(translatedAddr, false); + CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); + } + } + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -631,7 +669,7 @@ void CompileBlock(ARM* cpu) JitBlocks.Add(block); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; @@ -657,11 +695,14 @@ void InvalidateByAddr(u32 pseudoPhysical) FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; + if (mayRestore) + { + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; - RestoreCandidates[slot] = block; + 
RestoreCandidates[slot] = block; + } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) range->TimesInvalidated++; @@ -732,6 +773,7 @@ void ResetBlockCache() u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); CodeRanges[addr / 512].TimesInvalidated = 0; + CodeRanges[addr / 512].InvalidLiterals = 0; } delete block; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 1db4d66..09cc463 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -61,7 +61,7 @@ inline JitBlockEntry LookUpBlock(u32 addr) void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); void InvalidateAll(); void InvalidateITCM(u32 addr); diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 9e6713d..fb05f75 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -63,7 +63,7 @@ struct __attribute__((packed)) TinyVector { T* Data = NULL; u16 Capacity = 0; - u32 Length = 0; // make it 32 bit so we don't need movzx + u16 Length = 0; ~TinyVector() { @@ -181,6 +181,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; + u16 InvalidLiterals; u16 TimesInvalidated; }; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 3799774..82f80a7 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -108,7 +108,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); - CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); JMP((u8*)InvalidateByAddr, true); SetJumpTarget(noCode); @@ -206,7 +206,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(ABI_PARAM4), R(RSCRATCH)); SHR(32, 
R(RSCRATCH), Imm8(9)); SHL(32, R(RSCRATCH), Imm8(4)); - CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); FixupBranch noCode = J_CC(CC_Z); ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); @@ -278,10 +278,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) Comp_AddCycles_CDI(); } -void fault(u32 a, u32 b) +/*void fault(u32 a, u32 b, u32 c, u32 d) { - printf("actually not static! %x %x\n", a, b); -} + printf("actually not static! %x %x %x %x\n", a, b, c, d); +}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,11 +291,17 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; + //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - Comp_MemLoadLiteral(size, rd, addr); - return; + u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + + if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) + { + Comp_MemLoadLiteral(size, rd, addr); + return; + } } { @@ -438,6 +444,20 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz CALL(memoryFunc); + /*if (Num == 0 && check) + { + CMP(32, R(EAX), rdMapped); + FixupBranch notEqual = J_CC(CC_E); + ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); + MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 
4 : 8))); + MOV(32, R(ABI_PARAM2), R(EAX)); + MOV(32, R(ABI_PARAM3), rdMapped); + MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); + CALL((u8*)fault); + ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); + SetJumpTarget(notEqual); + }*/ + if (!(flags & memop_Store)) { if (inlinePreparation && size == 32) -- cgit v1.2.3 From 60650fa82e03dc8eb2a6118ce4cf2e4b0aa872e5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 6 Dec 2019 22:16:23 +0100 Subject: disable literal optimations in DTCM --- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 82f80a7..b66f304 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -347,8 +347,10 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz // stupid dtcm... if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; + // disable this for now as DTCM is located in heap + // which might excced the RIP-addressable range + //region.Mem = cpu5->DTCM; + //region.Mask = 0x3FFF; } else { -- cgit v1.2.3 From 9b98b8816a1dc1373ce9a57aef845263456702c3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 17:28:51 +0100 Subject: improve nop handling and proper behaviour for LDM^ fixes dslinux --- src/ARM.cpp | 2 ++ src/ARMJIT.cpp | 13 +++++++++---- src/ARMJIT_RegisterCache.h | 2 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 6 +++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 1 + src/ARMJIT_x64/ARMJIT_Compiler.h | 2 ++ src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 5 +++-- src/ARM_InstrInfo.cpp | 2 ++ src/ARM_InstrInfo.h | 2 ++ 9 files changed, 25 insertions(+), 10 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..07cc472 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -725,6 +725,8 @@ void ARMv4::ExecuteJIT() return; } + 
//printf("executing armv4 at %08x\n", instrAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); if (block) Cycles += block(); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index c7387c9..8fd7708 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -273,6 +273,8 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) typedef void (*InterpreterFunc)(ARM* cpu); +void NOP(ARM* cpu) {} + #define F(x) &ARMInterpreter::A_##x #define F_ALU(name, s) \ F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ @@ -320,7 +322,8 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = F(LDM), F(STM), F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), - F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC), + NOP }; #undef F_ALU #undef F_MEM_WB @@ -387,8 +390,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", - blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], cpu->Num == 0 ? 
LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); @@ -473,7 +476,9 @@ void CompileBlock(ARM* cpu) else { u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); - assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] + || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 2222bc2..b894657 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -152,7 +152,7 @@ public: needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); - } + } { BitSet16 loadedSet(LoadedRegs); BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 0dedb3f..e02865d 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -134,7 +134,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); bool previouslyDirty = CPSRDirty; SaveCPSR(); @@ -156,12 +156,12 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) if (!restoreCPSR) XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); else - MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste if (Num == 0) CALL((void*)&ARMv5::JumpTo); else CALL((void*)&ARMv4::JumpTo); - + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) { for (int reg : hiRegsLoaded) diff --git 
a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd38724..5afe842 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -308,6 +308,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff NULL, NULL, NULL, NULL, NULL, NULL, NULL, + F(Nop) }; const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 792ff66..2cb57dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -79,6 +79,8 @@ public: opInvertOp2 = 1 << 5, }; + void Nop() {} + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b66f304..4cafc1c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -531,7 +531,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -545,7 +545,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); - SaveReg(reg, ABI_PARAM3); + else + SaveReg(reg, ABI_PARAM3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 8f8bd35..08e2f0a 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -392,6 +392,8 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data 
= A_BLX_IMM; + else if ((instr >> 28) == 0xF) + data = ak(ak_Nop); if (data & A_UnkOnARM7 && num != 0) data = A_UNK; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 2732181..6ab4929 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -139,6 +139,8 @@ enum ak_MRC, ak_SVC, + ak_Nop, + ak_Count, tk_LSL_IMM = 0, -- cgit v1.2.3 From 5d0f244f3c86c2b1c65566bffa3972ae1dbac27b Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 16 Apr 2020 16:40:29 +0200 Subject: include more information in DataRegion --- src/ARM.h | 16 ++++++++-------- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_Internal.h | 7 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/CP15.cpp | 12 ++++++++++++ 5 files changed, 29 insertions(+), 14 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.h b/src/ARM.h index 8282c01..7767095 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -308,7 +308,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = NDS::ARM7Read8(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -317,7 +317,7 @@ public: addr &= ~1; *val = NDS::ARM7Read16(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -326,7 +326,7 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -341,7 +341,7 @@ public: void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -350,7 +350,7 @@ public: addr &= ~1; NDS::ARM7Write16(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -359,7 +359,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataRegion = addr >> 24; + DataRegion = addr >> 20; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -390,7 +390,7 @@ 
public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) // mainRAM + if ((DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles += numC + numD; @@ -417,7 +417,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if (DataRegion == 0x02) + if ((DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) Cycles += numC + numD; diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 513c117..00fa436 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) + if ((CurInstr.DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index b968dcb..0d6add9 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -40,9 +40,9 @@ struct FetchedInstr u32 Instr; u32 Addr; - u8 CodeCycles; u8 DataCycles; - u8 DataRegion; + u16 CodeCycles; + u32 DataRegion; ARMInstrInfo::Info Info; }; @@ -195,6 +195,9 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; +extern u8 MemRegion9[0x80000]; +extern u8 MemRegion7[0x80000]; + void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 5afe842..d69bdff 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -578,7 +578,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -623,7 +623,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if (CurInstr.DataRegion == 0x02) + if ((CurInstr.DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/CP15.cpp b/src/CP15.cpp index 10c3b1b..8bb4f6b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -728,6 +728,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { + DataRegion = addr >> 12; + if (addr < ITCMSize) { DataCycles = 1; @@ -747,6 +749,8 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { + DataRegion = addr >> 12; + addr &= ~1; if (addr < ITCMSize) @@ -768,6 +772,8 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { + DataRegion = addr >> 12; + addr &= ~3; if (addr < ITCMSize) @@ -810,6 +816,8 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { + DataRegion = addr >> 12; + if (addr < ITCMSize) { DataCycles = 1; @@ -832,6 +840,8 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { + DataRegion = addr >> 12; + addr &= ~1; if (addr < ITCMSize) @@ -856,6 +866,8 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { + DataRegion = addr >> 12; + addr &= ~3; if (addr < ITCMSize) -- cgit v1.2.3 From 3787bab1f69ae22d3e8106d70598ce923e5efe70 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 13:40:51 +0200 Subject: implement block linking + some refactoring currently only supported for x64 --- .gitignore | 2 + src/ARM.cpp | 37 +- src/ARM.h | 32 +- src/ARMJIT.cpp | 223 +++- src/ARMJIT.h | 10 +- src/ARMJIT_Internal.h | 24 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 23 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 140 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_GenOffsets.cpp | 15 + src/ARMJIT_x64/ARMJIT_Linkage.s | 74 ++ src/ARMJIT_x64/ARMJIT_Offsets.h | 3 + src/CMakeLists.txt | 7 + src/Config.cpp | 8 +- src/Config.h | 6 +- src/xxhash/xxh3.h | 
2390 ++++++++++++++++++++++++++++++++++ src/xxhash/xxhash.c | 43 + src/xxhash/xxhash.h | 1965 ++++++++++++++++++++++++++++ 18 files changed, 4871 insertions(+), 150 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_GenOffsets.cpp create mode 100644 src/ARMJIT_x64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_x64/ARMJIT_Offsets.h create mode 100644 src/xxhash/xxh3.h create mode 100644 src/xxhash/xxhash.c create mode 100644 src/xxhash/xxhash.h (limited to 'src/ARMJIT_x64') diff --git a/.gitignore b/.gitignore index dd81614..3c87740 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ melon_grc.h cmake-build cmake-build-debug .idea + +*.exe diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..32cb91c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -206,15 +206,15 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; + Cycles -= CodeCycles; } CPSR |= 0x20; @@ -227,9 +227,9 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; + Cycles -= CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; + Cycles -= CodeCycles; CPSR &= ~0x20; } @@ -272,7 +272,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead16(addr); NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][0] + NDS::ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -285,7 +285,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) NextInstr[0] = CodeRead32(addr); NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS::ARM7MemTimings[CodeCycles][2] + 
NDS::ARM7MemTimings[CodeCycles][3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][2] + NDS::ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -544,7 +544,7 @@ void ARMv5::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM9Timestamp += Cycles; + NDS::ARM9Timestamp -= Cycles; Cycles = 0; } @@ -584,14 +584,16 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp += Cycles; - Cycles = 0; + NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); if (StopExecution) { @@ -685,7 +687,7 @@ void ARMv4::Execute() }*/ if (IRQ) TriggerIRQ(); - NDS::ARM7Timestamp += Cycles; + NDS::ARM7Timestamp -= Cycles; Cycles = 0; } @@ -725,14 +727,15 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); if (block) - Cycles += block(); + ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp += Cycles; - Cycles = 0; + NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index 7767095..4877956 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -185,14 +185,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; + Cycles -= numC; } void AddCycles_CI(s32 numI) { // code+internal s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; - Cycles += numC + numI; + Cycles -= numC + numI; } void AddCycles_CDI() @@ -203,9 +203,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void AddCycles_CD() @@ -215,9 +215,9 @@ public: s32 numD = DataCycles; //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 6, std::max(numC, numD)); //else - // Cycles += numC + numD; + // Cycles -= numC + numD; } void GetCodeMemRegion(u32 addr, NDS::MemRegion* region); @@ -375,13 +375,13 @@ public: void AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; } void AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. 
- Cycles += NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; + Cycles -= NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; } void AddCycles_CDI() @@ -393,21 +393,21 @@ public: if ((DataRegion >> 4) == 0x02) // mainRAM { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else { numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } } else if (CodeRegion == 0x02) { numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD + 1; + Cycles -= numC + numD + 1; } } @@ -420,17 +420,17 @@ public: if ((DataRegion >> 4) == 0x02) { if (CodeRegion == 0x02) - Cycles += numC + numD; + Cycles -= numC + numD; else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else if (CodeRegion == 0x02) { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); + Cycles -= std::max(numC + numD - 3, std::max(numC, numD)); } else { - Cycles += numC + numD; + Cycles -= numC + numD; } } }; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 208801e..cc8d4ce 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -2,6 +2,10 @@ #include #include +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" #include "Config.h" @@ -113,16 +117,101 @@ const static ExeMemKind JIT_MEM[2][32] = { u32 AddrTranslate9[0x2000]; u32 AddrTranslate7[0x4000]; -JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; AddressRange CodeRanges[ExeMemSpaceSize / 512]; -TinyVector JitBlocks; -JitBlock* RestoreCandidates[0x1000] = {NULL}; +std::unordered_map JitBlocks; -u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) +template +struct UnreliableHashTable { - return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); -} + struct Bucket + { + K KeyA, KeyB; + V ValA, ValB; + }; + + Bucket Table[Size]; + + void Reset() 
+ { + for (int i = 0; i < Size; i++) + { + Table[i].ValA = Table[i].ValB = InvalidValue; + } + } + + UnreliableHashTable() + { + Reset(); + } + + V Insert(K key, V value) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA == value || bucket->ValB == value) + { + return InvalidValue; + } + else if (bucket->ValA == InvalidValue) + { + bucket->KeyA = key; + bucket->ValA = value; + } + else if (bucket->ValB == InvalidValue) + { + bucket->KeyB = key; + bucket->ValB = value; + } + else + { + V prevVal = bucket->ValB; + bucket->KeyB = bucket->KeyA; + bucket->ValB = bucket->ValA; + bucket->KeyA = key; + bucket->ValA = value; + return prevVal; + } + + return InvalidValue; + } + + void Remove(K key) + { + u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->KeyA == key && bucket->ValA != InvalidValue) + { + bucket->ValA = InvalidValue; + if (bucket->ValB != InvalidValue) + { + bucket->KeyA = bucket->KeyB; + bucket->ValA = bucket->ValB; + bucket->ValB = InvalidValue; + } + } + if (bucket->KeyB == key && bucket->ValB != InvalidValue) + bucket->ValB = InvalidValue; + } + + V LookUp(K addr) + { + u32 slot = XXH3_64bits(&addr, 4) & (Size - 1); + Bucket* bucket = &Table[slot]; + + if (bucket->ValA != InvalidValue && bucket->KeyA == addr) + return bucket->ValA; + if (bucket->ValB != InvalidValue && bucket->KeyB == addr) + return bucket->ValB; + + return InvalidValue; + } +}; + +UnreliableHashTable RestoreCandidates; +UnreliableHashTable FastBlockLookUp; void Init() { @@ -396,9 +485,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], - cpu->Num == 0 ? 
LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; @@ -534,6 +622,8 @@ void CompileBlock(ARM* cpu) if (staticBranch) { + instrs[i].BranchFlags |= branch_StaticTarget; + bool isBackJump = false; if (hasBranched) { @@ -604,12 +694,11 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); - u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); - JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; - if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + if (prevBlock) { - RestoreCandidates[restoreSlot] = NULL; + RestoreCandidates.Remove(pseudoPhysicalAddr); if (prevBlock->NumInstrs == i) { for (int j = 0; j < i; j++) @@ -661,7 +750,7 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -675,9 +764,8 @@ void CompileBlock(ARM* cpu) CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); } - FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - - JitBlocks.Add(block); + JitBlocks[pseudoPhysicalAddr] = block; + FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); } void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) @@ -701,18 +789,17 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) } } - bool removed = JitBlocks.RemoveByValue(block); - assert(removed); + for (int j = 0; j < block->NumLinks(); j++) + 
compiler->UnlinkBlock(block->Links()[j]); - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + JitBlocks.erase(block->PseudoPhysicalAddr); + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); if (mayRestore) { - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } } if ((range->TimesInvalidated + 1) > range->TimesInvalidated) @@ -738,47 +825,54 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; - FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - - for (int j = 0; j < block->NumAddresses; j++) + FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + + for (int i = 0; i < block->NumAddresses; i++) { - u32 addr = block->AddressRanges()[j]; + u32 addr = block->AddressRanges()[i]; AddressRange* range = &CodeRanges[addr / 512]; range->Blocks.Clear(); if (range->TimesInvalidated + 1 > range->TimesInvalidated) range->TimesInvalidated++; } + for (int i = 0; i < block->NumLinks(); i++) + compiler->UnlinkBlock(block->Links()[i]); + block->ResetLinks(); - u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); - if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) - delete RestoreCandidates[slot]; - - RestoreCandidates[slot] = block; + JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); + if (prevBlock) + delete prevBlock; } - JitBlocks.Clear(); + JitBlocks.clear(); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - - memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); - for (int i = 0; i < 
sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + + FastBlockLookUp.Reset(); + RestoreCandidates.Reset(); + for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { - if (RestoreCandidates[i]) + if (RestoreCandidates.Table[i].ValA) { - delete RestoreCandidates[i]; - RestoreCandidates[i] = NULL; + delete RestoreCandidates.Table[i].ValA; + RestoreCandidates.Table[i].ValA = NULL; + } + if (RestoreCandidates.Table[i].ValA) + { + delete RestoreCandidates.Table[i].ValB; + RestoreCandidates.Table[i].ValB = NULL; } } - for (int i = 0; i < JitBlocks.Length; i++) + for (auto it : JitBlocks) { - JitBlock* block = JitBlocks[i]; + JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -788,11 +882,43 @@ void ResetBlockCache() } delete block; } - JitBlocks.Clear(); + JitBlocks.clear(); compiler->Reset(); } +JitBlockEntry LookUpBlockEntry(u32 addr) +{ + u32 entryOffset = FastBlockLookUp.LookUp(addr); + if (entryOffset != UINT32_MAX) + return compiler->AddEntryOffset(entryOffset); + + auto block = JitBlocks.find(addr); + if (block != JitBlocks.end()) + { + FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + return block->second->EntryPoint; + } + return NULL; +} + +template +void LinkBlock(ARM* cpu, u32 codeOffset) +{ + u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); + auto block = JitBlocks.find(targetPseudoPhys); + if (block == JitBlocks.end()) + { + CompileBlock(cpu); + block = JitBlocks.find(targetPseudoPhys); + } + + JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); + + block->second->AddLink(codeOffset); + compiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) @@ -874,4 +1000,7 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) return NULL; } -} \ No newline at end of file +} + 
+template void ARMJIT::LinkBlock<0>(ARM*, u32); +template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 09cc463..cab385f 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -32,7 +32,6 @@ extern u32 AddrTranslate9[0x2000]; extern u32 AddrTranslate7[0x4000]; const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... -extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) @@ -52,11 +51,8 @@ inline u32 TranslateAddr(u32 addr) return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } -template -inline JitBlockEntry LookUpBlock(u32 addr) -{ - return FastBlockAccess[TranslateAddr(addr) / 2]; -} +JitBlockEntry LookUpBlockEntry(u32 addr); + void Init(); void DeInit(); @@ -73,4 +69,6 @@ void ResetBlockCache(); } +extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); + #endif \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 0d6add9..66d1808 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -15,7 +15,8 @@ enum { branch_IdleBranch = 1 << 0, branch_FollowCondTaken = 1 << 1, - branch_FollowCondNotTaken = 1 << 2 + branch_FollowCondNotTaken = 1 << 2, + branch_StaticTarget = 1 << 3, }; struct FetchedInstr @@ -76,7 +77,7 @@ struct __attribute__((packed)) TinyVector assert(capacity > Capacity); T* newMem = new T[capacity]; if (Data != NULL) - memcpy(newMem, Data, sizeof(Data) * Length); + memcpy(newMem, Data, sizeof(T) * Length); T* oldData = Data; Data = newMem; @@ -163,7 +164,6 @@ public: u32 NumInstrs; u32 NumAddresses; - u32 NumLinks; JitBlockEntry EntryPoint; @@ -171,6 +171,21 @@ public: { return &Data[0]; } u32* AddressRanges() { return &Data[NumInstrs]; } + u32* Links() + { return &Data[NumInstrs + NumAddresses]; } + + u32 NumLinks() + { return Data.Length - NumInstrs - NumAddresses; } + + void AddLink(u32 link) + { + Data.Add(link); + } + + void ResetLinks() + { + 
Data.SetLength(NumInstrs + NumAddresses); + } private: /* @@ -200,6 +215,9 @@ extern u8 MemRegion7[0x80000]; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); +template +void LinkBlock(ARM* cpu, u32 codeOffset); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index e02865d..cac590a 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -127,7 +127,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) @@ -135,7 +135,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) IrregularCycles = true; BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - bool previouslyDirty = CPSRDirty; + bool cpsrDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) @@ -168,9 +168,10 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) LoadReg(reg, RegCache.Mapping[reg]); } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + LoadCPSR(); + // in case this instruction is skipped + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } void Compiler::A_Comp_BranchImm() @@ -209,20 +210,12 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + Comp_SpecialBranchBehaviour(false); 
Comp_AddCycles_C(true); SetJumpTarget(skipFailed); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d69bdff..be3709e 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -1,6 +1,7 @@ #include "ARMJIT_Compiler.h" #include "../ARMInterpreter.h" +#include "../Config.h" #include @@ -15,6 +16,8 @@ using namespace Gen; +extern "C" void ARM_Ret(); + namespace ARMJIT { template <> @@ -170,6 +173,24 @@ Compiler::Compiler() RET(); } + { + CPSRDirty = true; + BranchStub[0] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<0>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + + CPSRDirty = true; + BranchStub[1] = GetWritableCodePtr(); + SaveCPSR(); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((u8*)ARMJIT::LinkBlock<1>); + LoadCPSR(); + JMP((u8*)ARM_Ret, true); + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -362,23 +383,43 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -void Compiler::Comp_SpecialBranchBehaviour() +void Compiler::Comp_SpecialBranchBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); + if (taken && CurInstr.BranchFlags & branch_IdleBranch) + OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) + 
&& (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); + + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)&ARM_Ret, true); + } } } -JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... ResetBlockCache(); @@ -388,15 +429,11 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Num = cpu->Num; CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; + // CPSR might have been modified in a previous block + CPSRDirty = Config::JIT_BrancheOptimisations == 2; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - - MOV(64, R(RCPU), ImmPtr(cpu)); - - LoadCPSR(); - RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -474,7 +511,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] else (this->*comp)(); - Comp_SpecialBranchBehaviour(); + Comp_SpecialBranchBehaviour(true); if (CurInstr.Cond() < 0xE) { @@ -485,15 +522,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - RegCache.PrepareExit(); - SaveCPSR(false); - - MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); - } + 
Comp_SpecialBranchBehaviour(false); SetJumpTarget(skipFailed); } @@ -504,17 +533,38 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } } - if (comp == NULL && i != instrsCount - 1) + if (comp == NULL) LoadCPSR(); } RegCache.Flush(); - SaveCPSR(); - MOV(32, R(RAX), Imm32(ConstantCycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); + + if (Config::JIT_BrancheOptimisations == 2 + && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) + && (!instrs[instrsCount - 1].Info.Branches() + || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken + || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) + { + FixupBranch ret = J_CC(CC_S); + CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); + FixupBranch ret2 = J_CC(CC_NZ); + + u8* rewritePart = GetWritableCodePtr(); + NOP(5); + + MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); + JMP((u8*)BranchStub[Num], true); - ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); - RET(); + SetJumpTarget(ret); + SetJumpTarget(ret2); + JMP((u8*)ARM_Ret, true); + } + else + { + JMP((u8*)ARM_Ret, true); + } /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -525,6 +575,22 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] return res; } +void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + JMP((u8*)entry, true); + SetCodePtr(curPtr); +} + +void Compiler::UnlinkBlock(u32 offset) +{ + u8* curPtr = GetWritableCodePtr(); + SetCodePtr(ResetStart + offset); + NOP(5); + SetCodePtr(curPtr); +} + void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? @@ -532,7 +598,7 @@ void Compiler::Comp_AddCycles_C(bool forceNonConstant) : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles); if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -544,7 +610,7 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -558,12 +624,12 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) if (!Thumb && CurInstr.Cond() < 0xE) { LEA(32, RSCRATCH, MDisp(i, add + cycles)); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); } else { ConstantCycles += i + cycles; - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } @@ -599,7 +665,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -643,7 +709,7 @@ void Compiler::Comp_AddCycles_CD() } if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2cb57dc..b428c33 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -51,7 +51,10 @@ public: void Reset(); - JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + void LinkBlock(u32 offset, JitBlockEntry entry); + void UnlinkBlock(u32 offset); + + JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int 
instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -145,7 +148,7 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void Comp_SpecialBranchBehaviour(); + void Comp_SpecialBranchBehaviour(bool taken); void* Gen_MemoryRoutine9(bool store, int size); @@ -176,12 +179,24 @@ public: return Gen::R(RegCache.Mapping[reg]); } + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(ResetStart + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - ResetStart; + } + u8* ResetStart; u32 CodeMemSize; bool Exit; bool IrregularCycles; + void* BranchStub[2]; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2]; diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp new file mode 100644 index 0000000..9696d22 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp @@ -0,0 +1,15 @@ +#include "../ARM.h" + +int main(int argc, char* argv[]) +{ + FILE* f = fopen("ARMJIT_Offsets.h", "w"); +#define writeOffset(field) \ + fprintf(f, "#define ARM_" #field "_offset 0x%x\n", offsetof(ARM, field)) + + writeOffset(CPSR); + writeOffset(Cycles); + writeOffset(StopExecution); + + fclose(f); + return 0; +} \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s new file mode 100644 index 0000000..dbbb024 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -0,0 +1,74 @@ +.intel_syntax noprefix + +#include "ARMJIT_Offsets.h" + +.text + +#define RCPU rbp +#define RCPSR r15d + +#ifdef WIN64 +#define ARG1_REG ecx +#define ARG2_REG edx +#define ARG3_REG r8d +#define ARG4_REG r9d +#define ARG1_REG64 rcx +#define ARG2_REG64 rdx +#define ARG3_REG64 r8 +#define ARG4_REG64 r9 +#else +#define ARG1_REG edi +#define ARG2_REG esi +#define ARG3_REG edx +#define ARG4_REG ecx +#define ARG1_REG64 rdi +#define ARG2_REG64 rsi +#define ARG3_REG64 rdx +#define ARG4_REG64 rcx +#endif + +.p2align 4,,15 + 
+.global ARM_Dispatch +ARM_Dispatch: +#ifdef WIN64 + push rdi + push rsi +#endif + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + +#ifdef WIN64 + sub rsp, 0x28 +#endif + mov RCPU, ARG1_REG64 + mov RCPSR, [RCPU + ARM_CPSR_offset] + + jmp ARG2_REG64 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + mov [RCPU + ARM_CPSR_offset], RCPSR + +#ifdef WIN64 + add rsp, 0x28 +#endif + + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx +#ifdef WIN64 + pop rsi + pop rdi +#endif + + ret diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h new file mode 100644 index 0000000..a73dd59 --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_Offsets.h @@ -0,0 +1,3 @@ +#define ARM_CPSR_offset 0x64 +#define ARM_Cycles_offset 0xc +#define ARM_StopExecution_offset 0x10 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 912299d..f650f42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,9 +30,13 @@ add_library(core STATIC SPU.cpp Wifi.cpp WifiAP.cpp + + xxhash/xxhash.c ) if (ENABLE_JIT) + enable_language(ASM) + target_sources(core PRIVATE ARMJIT.cpp @@ -49,7 +53,10 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_ALU.cpp ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp + + ARMJIT_x64/ARMJIT_Linkage.s ) + set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE diff --git a/src/Config.cpp b/src/Config.cpp index be6a833..f3f8c6c 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -35,10 +35,10 @@ int GL_ScaleFactor; int GL_Antialias; #ifdef JIT_ENABLED -bool JIT_Enable = false; +int JIT_Enable = false; int JIT_MaxBlockSize = 12; -bool JIT_BrancheOptimisations = true; -bool JIT_LiteralOptimisations = true; +int JIT_BrancheOptimisations = 2; +int JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,7 +52,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, 
{"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, - {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif diff --git a/src/Config.h b/src/Config.h index 723ab13..fff476a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -47,10 +47,10 @@ extern int GL_ScaleFactor; extern int GL_Antialias; #ifdef JIT_ENABLED -extern bool JIT_Enable; +extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern bool JIT_BrancheOptimisations; -extern bool JIT_LiteralOptimisations; +extern int JIT_BrancheOptimisations; +extern int JIT_LiteralOptimisations; #endif } diff --git a/src/xxhash/xxh3.h b/src/xxhash/xxh3.h new file mode 100644 index 0000000..5d5faf8 --- /dev/null +++ b/src/xxhash/xxh3.h @@ -0,0 +1,2390 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Development source file for `xxh3` + * Copyright (C) 2019-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Note: This file is separated for development purposes. + * It will be integrated into `xxhash.h` when development stage is completed. + * + * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 + */ + +#ifndef XXH3_H_1397135465 +#define XXH3_H_1397135465 + +/* === Dependencies === */ +#ifndef XXHASH_H_5627135585666179 +/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ +# undef XXH_INLINE_ALL /* avoid redefinition */ +# define XXH_INLINE_ALL +#endif +#include "xxhash.h" + + +/* === Compiler specifics === */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline 
__inline__ /* clang bug */ +# include +# undef inline +# endif +#elif defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. 
+ * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXH_SCALAR 0 /* Portable scalar version */ +#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ +#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ +#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ +#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ +#define XXH_AVX512 5 /* AVX512 for Skylake and Icelake */ + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define 
XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator. + * This is for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. 
+ */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. | + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. 
+ * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. 
+ */ + +/* + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include +# else +# include +# endif + +# undef vector /* Undo the pollution */ + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." 
+# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +/* A wrapper for POWER9's vec_revb. */ +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/* + * Performs an unaligned load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/* Pseudorandom secret taken directly from FARSH */ +XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 
0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +/* + * Calculates a 32-bit to 64-bit long multiply. + * + * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) + * { + * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); + * } + */ +#if defined(_MSC_VER) && defined(_M_IX86) +# include +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. 
 */
# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
#endif

/*
 * Calculates a 64->128-bit long multiply.
 *
 * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
 */
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * Despite being a 32-bit platform, Clang (and emscripten) define this type
     * despite not having the arithmetic for it. This results in a laggy
     * compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
    /* NOTE(review): `&&` binds tighter than `||`, so the `_INTEGRAL_MAX_BITS`
     * alternative below stands on its own; parentheses around the whole GCC
     * clause would make that precedence explicit. */
#if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t r128;
    r128.low64  = (xxh_u64)(product);
    r128.high64 = (xxh_u64)(product >> 64);
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif defined(_M_X64) || defined(_M_IA64)

/* NOTE(review): `#pragma intrinsic` is an MSVC pragma, so `#ifndef _MSC_VER`
 * looks inverted (`#ifdef` would be expected) — confirm against upstream.
 * Harmless in practice since `_M_X64`/`_M_IA64` imply an MSVC-compatible
 * compiler that also defines `_MSC_VER`. */
#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t r128;
    r128.low64  = product_low;
    r128.high64 = product_high;
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown below
     * with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
     *     ---------
     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
     *     ---------
     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
     *     This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARM's Digital Signal Processing extension
     *     in 32-bit ARMv6 and later, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication, and
     *     allows this to be calculated in only 4 instructions at speeds
     *     comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be a couple
     *     of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128;
    r128.low64  = lower;
    r128.high64 = upper;
    return r128;
#endif
}

/*
 * Does a 64-bit to 128-bit multiply, then XOR folds it.
 *
 * The reason for the separate function is to prevent passing too many structs
 * around by value. This will hopefully inline the multiply, but we don't force it.
 */
static xxh_u64
XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}

/* Seems to produce slightly better code on GCC for some reason. */
XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
{
    XXH_ASSERT(0 <= shift && shift < 64);
    return v64 ^ (v64 >> shift);
}

/*
 * We don't need to (or want to) mix as much as XXH64.
 *
 * Short hashes are more evenly distributed, so it isn't necessary.
 */
static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
{
    h64 = XXH_xorshift64(h64, 37);
    h64 *= 0x165667919E3779F9ULL;
    h64 = XXH_xorshift64(h64, 32);
    return h64;
}


/* ==========================================
 * Short keys
 * ==========================================
 * One of the shortcomings of XXH32 and XXH64 was that their performance was
 * sub-optimal on short lengths. It used an iterative algorithm which strongly
 * favored lengths that were a multiple of 4 or 8.
 *
 * Instead of iterating over individual inputs, we use a set of single shot
 * functions which piece together a range of lengths and operate in constant time.
 *
 * Additionally, the number of multiplies has been significantly reduced. This
 * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
 *
 * Depending on the platform, this may or may not be faster than XXH32, but it
 * is almost guaranteed to be faster than XXH64.
+ */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + xxh_u64 const mixed = keyed * PRIME64_1; + return XXH3_avalanche(mixed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len < 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = 
input2 + (((xxh_u64)input1) << 32); + xxh_u64 x = input64 ^ bitflip; + /* this mix is inspired by Pelle Evensen's rrmxmx */ + x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); + x *= 0x9FB21C651E98DF25ULL; + x ^= (x >> 35) + len ; + x *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(x, 28); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(8 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. 
 *
 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
 * cancelling out the secret is taken an arbitrary number of times (addressed
 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
 * and/or proper seeding:
 *
 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
 * function that is only called up to 16 times per hash with up to 240 bytes of
 * input.
 *
 * This is not too bad for a non-cryptographic hash function, especially with
 * only 64 bit outputs.
 *
 * The 128-bit variant (which trades some speed for strength) is NOT affected
 * by this, although it is always a good idea to use a proper seed if you care
 * about strength.
 */
/* Mixes one 16-byte lane of input with 16 bytes of secret (seed-adjusted),
 * then folds the 128-bit product down to 64 bits. */
XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
    /*
     * UGLY HACK:
     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
     * slower code.
     *
     * By forcing seed64 into a register, we disrupt the cost model and
     * cause it to scalarize. See `XXH32_round()`
     *
     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
     * GCC 9.2, despite both emitting scalar code.
     *
     * GCC generates much better scalar code than Clang for the rest of XXH3,
     * which is why finding a more optimal codepath is an interest.
     */
    __asm__ ("" : "+r" (seed64));
#endif
    {   xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64 const input_hi = XXH_readLE64(input+8);
        return XXH3_mul128_fold64(
                   input_lo ^ (XXH_readLE64(secret)   + seed64),
                   input_hi ^ (XXH_readLE64(secret+8) - seed64)
               );
    }
}

/* For mid range keys, XXH3 uses a Mum-hash variant.
 */
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    /* Mixes 16-byte lanes pairwise from both ends of the input; longer
     * inputs add more pairs, each pair consuming a fresh 32-byte slice of
     * the secret. Pairs may overlap for lengths that are not multiples of 32. */
    {   xxh_u64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3_mix16B(input+48, secret+96, seed);
                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3_mix16B(input+32, secret+64, seed);
                acc += XXH3_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3_mix16B(input+16, secret+32, seed);
            acc += XXH3_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3_mix16B(input+0, secret+0, seed);
        acc += XXH3_mix16B(input+len-16, secret+16, seed);

        return XXH3_avalanche(acc);
    }
}

#define XXH3_MIDSIZE_MAX 240

XXH_NO_INLINE XXH64_hash_t
XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    /* offsets into the secret for rounds past the first 8, and for the
     * final 16-byte lane — deliberately misaligned so the secret bytes used
     * differ from the first pass */
    #define XXH3_MIDSIZE_STARTOFFSET 3
    #define XXH3_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        /* first 8 rounds use the secret from offset 0, like 17to128 */
        for (i=0; i<8; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
        }
        acc = XXH3_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
         * In everywhere else, it uses scalar code.
         *
         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
         * would still be slower than UMAAL (see XXH_mult64to128).
         *
         * Unfortunately, Clang doesn't handle the long multiplies properly and
         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
         * scalarized into an ugly mess of VMOV.32 instructions.
         *
         * This mess is difficult to avoid without turning autovectorization
         * off completely, but they are usually relatively minor and/or not
         * worth it to fix.
         *
         * This loop is the easiest to fix, as unlike XXH32, this pragma
         * _actually works_ because it is a loop vectorization instead of an
         * SLP vectorization.
         */
        #pragma clang loop vectorize(disable)
#endif
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
        return XXH3_avalanche(acc);
    }
}


/* === Long Keys === */

#define STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))

/* selects 64-bit vs 128-bit accumulation behavior in the shared kernels */
typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;

/*
 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
 *
 * It is a hardened version of UMAC, based off of FARSH's implementation.
 *
 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
 * implementations, and it is ridiculously fast.
 *
 * We harden it by mixing the original input to the accumulators as well as the product.
 *
 * This means that in the (relatively likely) case of a multiply by zero, the
 * original input is preserved.
 *
 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
 * cross-pollination, as otherwise the upper and lower halves would be
 * essentially independent.
 *
 * This doesn't matter on 64-bit hashes since they all get merged together in
 * the end, so we skip the extra step.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret,
                    XXH3_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX512)

    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i));
    {   XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;

        /* data_vec = input[0]; */
        __m512i const data_vec    = _mm512_loadu_si512   (input);
        /* key_vec = secret[0]; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        /* data_key = data_vec ^ key_vec; */
        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
        /* data_key_lo = data_key >> 32; */
        __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
        /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
        if (accWidth == XXH3_acc_128bits) {
            /* xacc[0] += swap(data_vec); */
            __m512i const data_swap = _mm512_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
            __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
            /* xacc[0] += product; */
            *xacc = _mm512_add_epi64(product, sum);
        } else {  /* XXH3_acc_64bits */
            /* xacc[0] += data_vec; */
            __m512i const sum = _mm512_add_epi64(*xacc, data_vec);
            /* xacc[0] += product; */
            *xacc = _mm512_add_epi64(product, sum);
        }
    }

#elif (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const __m256i* const xinput = (const __m256i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const __m256i* const xsecret = (const __m256i *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* data_vec = xinput[i]; */
            __m256i const data_vec    = _mm256_loadu_si256   (xinput+i);
            /* key_vec = xsecret[i]; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            /* data_key = data_vec ^ key_vec; */
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
            if (accWidth == XXH3_acc_128bits) {
                /* xacc[i] += swap(data_vec); */
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
                __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
                /* xacc[i] += product; */
                xacc[i] = _mm256_add_epi64(product, sum);
            } else {  /* XXH3_acc_64bits */
                /* xacc[i] += data_vec; */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                /* xacc[i] += product; */
                xacc[i] = _mm256_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    /* SSE2 is just a half-scale version of the AVX2 version. */
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const __m128i* const xinput = (const __m128i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const __m128i* const xsecret = (const __m128i *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* data_vec = xinput[i]; */
            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
            /* key_vec = xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            /* data_key = data_vec ^ key_vec; */
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
            if (accWidth == XXH3_acc_128bits) {
                /* xacc[i] += swap(data_vec); */
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
                /* xacc[i] += product; */
                xacc[i] = _mm_add_epi64(product, sum);
            } else {  /* XXH3_acc_64bits */
                /* xacc[i] += data_vec; */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                /* xacc[i] += product; */
                xacc[i] = _mm_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput = (const uint8_t *) input;
        uint8_t const* const xsecret = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
            /* data_vec = xinput[i]; */
            uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t key_vec  = vld1q_u8(xsecret + (i * 16));
            uint64x2_t data_key;
            uint32x2_t data_key_lo, data_key_hi;
            if (accWidth == XXH3_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* data_key = data_vec ^ key_vec; */
            data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
             * data_key_hi = (uint32x2_t) (data_key >> 32);
             * data_key = UNDEFINED; */
            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

        }
    }

#elif (XXH_VECTOR == XXH_VSX)
          xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;    /* presumed aligned */
    xxh_u64x2 const* const xinput  = (xxh_u64x2 const*) input;   /* no alignment restriction */
    xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret;  /* no alignment restriction */
    xxh_u64x2 const v32 = { 32, 32 };
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) {
        /* data_vec = xinput[i]; */
        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
        /* key_vec = xsecret[i]; */
        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
        xxh_u64x2 const data_key = data_vec ^ key_vec;
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
        xacc[i] += product;

        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3_acc_128bits */
            /* swap high and low halves */
#ifdef __s390x__
            xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2);
#else
            xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
#endif
            xacc[i] += data_swapped;
        }
    }

#else   /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
    const xxh_u8* const xinput  = (const xxh_u8*) input;  /* no alignment restriction */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);

        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_val;
        } else {
            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        }
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
#endif
}

/*
 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
 *
 * Multiplication isn't perfect, as explained by Google in HighwayHash:
 *
 *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
 *  // varying degrees. In descending order of goodness, bytes
 *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
 *  // As expected, the upper and lower bytes are much worse.
 *
 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
 *
 * Since our algorithm uses a pseudorandom secret to add some variance into the
 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
 *
 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
 * extraction.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
#if (XXH_VECTOR == XXH_AVX512)

    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(STRIPE_LEN == sizeof(__m512i));
    {   XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
        const __m512i prime32 = _mm512_set1_epi32((int)PRIME32_1);

        /* xacc[0] ^= (xacc[0] >> 47) */
        __m512i const acc_vec     = *xacc;
        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
        /* xacc[0] ^= secret; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);

        /* xacc[0] *= PRIME32_1; */
        __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
    }

#elif (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const __m256i* const xsecret = (const __m256i *) secret;
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec     = xacc[i];
            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const __m128i* const xsecret = (const __m128i *) secret;
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec     = xacc[i];
            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* xacc       = (uint64x2_t*) acc;
        uint8_t const* xsecret = (uint8_t const*) secret;
        uint32x2_t prime       = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            uint64x2_t acc_vec  = xacc[i];
            uint64x2_t shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t data_vec = veorq_u64   (acc_vec, shifted);

            /* xacc[i] ^= xsecret[i]; */
            uint8x16_t key_vec  = vld1q_u8(xsecret + (i * 16));
            uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));

            /* xacc[i] *= PRIME32_1 */
            uint32x2_t data_key_lo, data_key_hi;
            /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
             * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
             * xacc[i] = UNDEFINED; */
            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
            {   /*
                 * prod_hi = (data_key >> 32) * PRIME32_1;
                 *
                 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
                 * incorrectly "optimize" this:
                 *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
                 *   shifted = vshll_n_u32(tmp, 32);
                 * to this:
                 *   tmp     = "vmulq_u64"(a, b); // no such thing!
                 *   shifted = vshlq_n_u64(tmp, 32);
                 *
                 * However, unlike SSE, Clang lacks a 64-bit multiply routine
                 * for NEON, and it scalarizes two 64-bit multiplies instead.
                 *
                 * vmull_u32 has the same timing as vmul_u32, and it avoids
                 * this bug completely.
                 * See https://bugs.llvm.org/show_bug.cgi?id=39967
                 */
                uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
                /* xacc[i] = prod_hi << 32; */
                xacc[i] = vshlq_n_u64(prod_hi, 32);
                /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
                xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
            }
    }   }

#elif (XXH_VECTOR == XXH_VSX)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {         xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;
        const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
        /* constants */
        xxh_u64x2 const v32 = { 32, 32 };
        xxh_u64x2 const v47 = { 47, 47 };
        xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            xxh_u64x2 const acc_vec  = xacc[i];
            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);

            /* xacc[i] ^= xsecret[i]; */
            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
            xxh_u64x2 const data_key = data_vec ^ key_vec;

            /* xacc[i] *= PRIME32_1 */
            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
            xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
            xacc[i] = prod_odd + (prod_even << v32);
    }   }

#else   /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
        xxh_u64 acc64 = xacc[i];
        acc64 = XXH_xorshift64(acc64, 47);
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

#define XXH_PREFETCH_DIST 384

/* AVX512 streams data fast enough to want a different prefetch distance,
 * and the best distance differs between compilers. */
#ifdef __clang__ // for clang
#  define XXH_PREFETCH_DIST_AVX512_64  320
#  define XXH_PREFETCH_DIST_AVX512_128 320
#else // for gcc
#  define XXH_PREFETCH_DIST_AVX512_64  640
#  define XXH_PREFETCH_DIST_AVX512_128 512
#endif

/*
 * XXH3_accumulate()
 * Loops over XXH3_accumulate_512().
 * Assumption: nbStripes will not overflow the secret size
 */
XXH_FORCE_INLINE void
XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
                const xxh_u8* XXH_RESTRICT input,
                const xxh_u8* XXH_RESTRICT secret,
                      size_t nbStripes,
                      XXH3_accWidth_e accWidth)
{
    size_t n;
    for (n = 0; n < nbStripes; n++ ) {
        const xxh_u8* const in = input + n*STRIPE_LEN;
#if (XXH_VECTOR == XXH_AVX512)
        if (accWidth == XXH3_acc_64bits) XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_64);
        else XXH_PREFETCH(in + XXH_PREFETCH_DIST_AVX512_128);
#else
        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
#endif
        /* each stripe advances the secret window by XXH_SECRET_CONSUME_RATE bytes */
        XXH3_accumulate_512(acc,
                            in,
                            secret + n*XXH_SECRET_CONSUME_RATE,
                            accWidth);
    }
}

/* Core long-input loop: full blocks of accumulation followed by a scramble,
 * then a partial block and a final (possibly overlapping) last stripe. */
XXH_FORCE_INLINE void
XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
                      const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const xxh_u8* const p = input + len - STRIPE_LEN;
            /* Do not align on 8, so that the secret is different from the scrambler */
#define XXH_SECRET_LASTACC_START 7
            XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}

/* Mixes a pair of accumulator lanes with 16 secret bytes and folds to 64 bits. */
XXH_FORCE_INLINE xxh_u64
XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
{
    return XXH3_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

/* Converges the 8 accumulator lanes into a single avalanched 64-bit hash. */
static XXH64_hash_t
XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;
    size_t i = 0;

    for (i = 0; i < 4; i++) {
        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
         * XXH3_64bits, len == 256, Snapdragon 835:
         *   without hack: 2063.7 MB/s
         *   with hack:    2560.7 MB/s
         */
        __asm__("" : "+r" (result64));
#endif
    }

    return XXH3_avalanche(result64);
}

#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }

/* Full long-input 64-bit hash: accumulate over the whole input, then merge. */
XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_64b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                           const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    /* do not align on 8, so that the secret is different from the accumulator */
#define XXH_SECRET_MERGEACCS_START 11
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
}

/* Stores a 64-bit value as little-endian, regardless of host endianness. */
XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* XXH3_initCustomSecret() :
 * destination `customSecret` is presumed allocated and same size as `kSecret`.
 */
XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
    int i;
    /*
     * We need a separate pointer for the hack below.
     * Any decent compiler will optimize this out otherwise.
     */
    const xxh_u8 *kSecretPtr = kSecret;

    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

#if defined(__clang__) && defined(__aarch64__)
    /*
     * UGLY HACK:
     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
     * placed sequentially, in order, at the top of the unrolled loop.
     *
     * While MOVK is great for generating constants (2 cycles for a 64-bit
     * constant compared to 4 cycles for LDR), long MOVK chains stall the
     * integer pipelines:
     *   I   L   S
     * MOVK
     * MOVK
     * MOVK
     * MOVK
     * ADD
     * SUB      STR
     *          STR
     * By forcing loads from memory (as the asm line causes Clang to assume
     * that kSecretPtr has been changed), the pipelines are used more efficiently:
     *   I   L   S
     *      LDR
     *  ADD LDR
     *  SUB     STR
     *          STR
     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
     *   without hack: 2654.4 MB/s
     *   with hack:    3202.9 MB/s
     */
    __asm__("" : "+r" (kSecretPtr));
#endif
    /*
     * Note: in debug mode, this overrides the asm optimization
     * and Clang will emit MOVK chains again.
     */
    XXH_ASSERT(kSecretPtr == kSecret);

    for (i=0; i < nbRounds; i++) {
        /*
         * The asm hack causes Clang to assume that kSecretPtr aliases with
         * customSecret, and on aarch64, this prevented LDP from merging two
         * loads together for free. Putting the loads together before the stores
         * properly generates LDP.
         */
        xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
        xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
        XXH_writeLE64(customSecret + 16*i,     lo);
        XXH_writeLE64(customSecret + 16*i + 8, hi);
    }
}


/*
 * It's important for performance that XXH3_hashLong is not inlined. Not sure
 * why (uop cache maybe?), but the difference is large and easily measurable.
 */
XXH_NO_INLINE XXH64_hash_t
XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
{
    return XXH3_hashLong_64b_internal(input, len, kSecret, sizeof(kSecret));
}

/*
 * It's important for performance that XXH3_hashLong is not inlined. Not sure
 * why (uop cache maybe?), but the difference is large and easily measurable.
+ */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + return XXH3_hashLong_64b_internal(input, len, secret, secretSize); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret)); +} + +/* === Public entry point === */ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. 
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXH3 streaming === */ + + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. 
+ * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. 
*/ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + return (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); +} + +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_64bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->acc[0] = PRIME32_3; + statePtr->acc[1] = PRIME64_1; + statePtr->acc[2] = PRIME64_2; + statePtr->acc[3] = PRIME64_3; + statePtr->acc[4] = PRIME64_4; + statePtr->acc[5] = PRIME32_2; + statePtr->acc[6] = PRIME64_5; + statePtr->acc[7] = PRIME32_1; + statePtr->seed = seed; + XXH_ASSERT(secret != NULL); + statePtr->secret = secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) 
return XXH_ERROR; + XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_FORCE_INLINE void +XXH3_consumeStripes( xxh_u64* acc, + XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, + const xxh_u8* input, size_t totalStripes, + const xxh_u8* secret, size_t secretLimit, + XXH3_accWidth_e accWidth) +{ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { + /* need a scrambling operation */ + size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); + XXH3_scrambleAcc(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); + *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); + *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. 
+ */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + + state->totalLen += len; + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ + + /* + * There is some input left inside the internal buffer. + * Fill it, then consume it. + */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + state->bufferedSize = 0; + } + + /* Consume input by full buffer quantities */ + if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(state->acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + state->secret, state->secretLimit, + accWidth); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<=limit); + } + + if (input < bEnd) { /* Some remaining input: buffer it */ + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH3_state_t* state, const void* input, 
size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) +{ + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= STRIPE_LEN) { + size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; + XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, totalNbStripes, + state->secret, state->secretLimit, + accWidth); + if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ + XXH3_accumulate_512(acc, + state->buffer + state->bufferedSize - STRIPE_LEN, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } + } else { /* bufferedSize < STRIPE_LEN */ + if (state->bufferedSize) { /* one last stripe */ + xxh_u8 lastStripe[STRIPE_LEN]; + size_t const catchupSize = STRIPE_LEN - state->bufferedSize; + memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH3_accumulate_512(acc, + lastStripe, + state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, + accWidth); + } } +} + +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_64bits); + return XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + } + /* len <= XXH3_MIDSIZE_MAX: short code */ + if (state->seed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + 
state->secret, state->secretLimit + STRIPE_LEN); +} + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
*/ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + xxh_u64 const mixedl = keyed_lo * PRIME64_1; + xxh_u64 const mixedh = keyed_hi * PRIME64_5; + XXH128_hash_t h128; + h128.low64 = XXH3_avalanche(mixedl); + h128.high64 = XXH3_avalanche(mixedh); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. 
*/ + XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); + h128.high64 += m128.high64 * PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); + h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = 
XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * PRIME64_1) + + (acc.high64 * PRIME64_4) + + ((len - seed) * PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) +{ + return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret)); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. 
+ */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, + const xxh_u8* secret, size_t secretSize) +{ + return XXH3_hashLong_128b_internal(input, len, secret, secretSize); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) +{ + XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); + XXH3_initCustomSecret(secret, seed); + return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); +} + + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. 
+ */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ + +/* + * All the functions are actually the same as for 64-bit streaming variant. + * The only difference is the finalizatiom routine. 
+ */ + +static void +XXH3_128bits_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const xxh_u8* secret, size_t secretSize) +{ + XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); + XXH3_initCustomSecret(statePtr->customSecret, seed); + statePtr->secret = statePtr->customSecret; + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits); +} + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; + XXH3_digest_long(acc, state, XXH3_acc_128bits); + XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + state->secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + state->secret + state->secretLimit + STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * PRIME64_2)); + return h128; + } + } + 
/* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + state->secret, state->secretLimit + STRIPE_LEN); +} + +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && 
!defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH3_H_1397135465 */ diff --git a/src/xxhash/xxhash.c b/src/xxhash/xxhash.c new file mode 100644 index 0000000..0fae88c --- /dev/null +++ b/src/xxhash/xxhash.c @@ -0,0 +1,43 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/src/xxhash/xxhash.h b/src/xxhash/xxhash.h new file mode 100644 index 0000000..67a5887 --- /dev/null +++ b/src/xxhash/xxhash.h @@ -0,0 +1,1965 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* TODO: update */ +/* Notice extracted from xxHash homepage: + +xxHash is an extremely fast hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +Note: SMHasher's CRC32 implementation is not the fastest one. +Other speed-oriented implementations can be faster, +especially in combination with PCLMUL instruction: +https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. 
+Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. 
+ * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. + * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/* specific 
declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +/*! + * XXH_NAMESPACE, aka Namespace Emulation: + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, 
XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 7 +#define XXH_VERSION_RELEASE 4 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Definitions +******************************/ +#include /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint32_t XXH32_hash_t; +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# else +# if ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +# endif +#endif + +/*! + * XXH32(): + * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + * The memory between input & input+length must be valid (allocated and read-accessible). + * "seed" can be used to alter the result predictably. + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. 
It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. + */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +/******* Streaming *******/ + +/* + * Streaming functions generate the xxHash value from an incrememtal input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + */ + +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This the simplest and fastest format for further post-processing. 
+ * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint64_t XXH64_hash_t; +#else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +#endif + +/*! + * XXH64(): + * Returns the 64-bit hash of sequence of length @length stored at memory + * address @input. + * @seed can be used to alter the result predictably. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. 
+ */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation of an XXH + * state, for example, on the stack or in a struct. + * Never **ever** access members directly. 
+ */ + +struct XXH32_state_s { + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +struct XXH64_state_s { + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved32; /* required for padding anyway */ + XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +/* ************************************************************************ + * XXH3 is a new hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * In general, expect XXH3 to run about ~2x faster on large inputs and >3x + * faster on small ones compared to XXH64, though exact differences depend on + * the platform. + * + * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash + * on all platforms. + * + * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. + * + * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run + * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are + * explained in the implementation. 
+ * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The 128-bit version adds additional strength, but it is slightly slower. + * + * The XXH3 algorithm is still in development. + * The results it produces may still change in future versions. + * + * Results produced by v0.7.x are not comparable with results from v0.7.y. + * However, the API is completely stable, and it can safely be used for + * ephemeral data (local sessions). + * + * Avoid storing values in long-term storage until the algorithm is finalized. + * + * Since v0.7.3, XXH3 has reached "release candidate" status, meaning that, if + * everything remains fine, its current format will be "frozen" and become the + * final one. + * + * After which, return values of XXH3 and XXH128 will no longer change in + * future versions. + * + * XXH3's return values will be officially finalized upon reaching v0.8.0. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. 
+ */ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) + +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) + +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +#endif + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional + * collision. + * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid trivial sequences, such as repeating sequences and especially '\0', + * as this can cancel out itself. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXH3_SECRET_SIZE_MIN 136 +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly based on the default + * secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. 
+ * Note: seed==0 produces the same results as XXH3_64bits(). + */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); + + +/* streaming 64-bit */ + +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ +# include +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +typedef struct XXH3_state_s XXH3_state_t; + +#define XXH3_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ +#define XXH3_INTERNALBUFFER_SIZE 256 +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /* used to store a custom secret generated from the seed. Makes state larger. + * Design might change */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + XXH32_hash_t bufferedSize; + XXH32_hash_t nbStripesPerBlock; + XXH32_hash_t nbStripesSoFar; + XXH32_hash_t secretLimit; + XXH32_hash_t reserved32; + XXH32_hash_t reserved32_2; + XXH64_hash_t totalLen; + XXH64_hash_t seed; + XXH64_hash_t reserved64; + /* note: there is some padding after due to alignment on 64 bytes */ + const unsigned char* secret; +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever possible. 
+ */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + + +/* + * XXH3_64bits_reset(): + * Initialize with the default parameters. + * The result will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/* + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, and must outlive the hash streaming session, so + * be careful when using stack arrays. + * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + + +/* 128-bit */ + +#ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) + +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) + +# define XXH128_isEqual 
XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + + +/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
+ * + * return: >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 + */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[16]; } XXH128_canonical_t; +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); +XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be found in xxhash.c. + * + * However, code inlining requires the implementation to be visible to the + * compiler, usually within the header. + * + * As a workaround, xxhash.c used to be included within xxhash.h. This caused + * some issues with some build systems, especially ones which treat .c files + * as source files. + * + * Therefore, the implementation is now directly integrated within xxhash.h. + * Another small advantage is that xxhash.c is no longer needed in /include. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ +/*! 
+ * XXH_FORCE_MEMORY_ACCESS: + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow to select a different access method for improved + * performance. + * Method 0 (default): + * Use `memcpy()`. Safe and portable. + * Method 1: + * `__attribute__((packed))` statement. It depends on compiler extensions + * and is therefore not portable. + * This method is safe if your compiler supports it, and *generally* as + * fast or faster than `memcpy`. + * Method 2: + * Direct access via cast. This method doesn't depend on the compiler but + * violates the C standard. + * It can generate buggy code on targets which do not support unaligned + * memory accesses. + * But in some circumstances, it's the only known way to get the most + * performance (ie GCC + ARMv6) + * Method 3: + * Byteshift. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. + * See https://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2 > 3) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*! + *XXH_ACCEPT_NULL_INPUT_POINTER: + * If the input pointer is NULL, xxHash's default behavior is to dereference it, + * triggering a segfault. + * When this macro is enabled, xxHash actively checks the input for a null pointer. 
+ * If it is, the result for null input pointers is the same as a zero-length input. + */ +#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*! + * XXH_FORCE_ALIGN_CHECK: + * This is a minor performance trick, only useful with lots of very small keys. + * It means: check for aligned/unaligned input. + * The check costs one initial branch per hash; + * Set it to 0 when the input is guaranteed to be aligned or when alignment + * doesn't matter for performance. + * + * This option does not affect XXH3. + */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +/*! + * XXH_NO_INLINE_HINTS: + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ +#ifndef XXH_NO_INLINE_HINTS +# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ + || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +/*! + * XXH_REROLL: + * Whether to reroll XXH32_finalize, and XXH64_finalize, + * instead of using an unrolled jump table/if statement loop. + * + * This is automatically defined on -Os/-Oz on GCC and Clang. 
+ */ +#ifndef XXH_REROLL +# if defined(__OPTIMIZE_SIZE__) +# define XXH_REROLL 1 +# else +# define XXH_REROLL 0 +# endif +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/*! + * Modify the local functions below should you wish to use some other memory + * routines for malloc() and free() + */ +#include + +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free(void* p) { free(p); } + +/*! and for memcpy() */ +#include +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#else +# if defined (__cplusplus) \ + || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define XXH_FORCE_INLINE static inline __attribute__((always_inline)) +# define XXH_NO_INLINE static __attribute__((noinline)) +# else +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +# endif +# else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + + +/* ************************************* +* Debug +***************************************/ +/* + * DEBUGLEVEL is expected to be defined externally, typically via the compiler's + * command line options. The value must be a number. 
+ */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + +#if (DEBUGLEVEL>=1) +# include /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# define XXH_ASSERT(c) ((void)0) +#endif + +/* note: use after variable declarations */ +#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0) + + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + + +/* *** Memory access *** */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianess *** */ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/*! + * XXH_CPU_LITTLE_ENDIAN: + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +static int XXH_isLittleEndian(void) +{ + /* + * Nonstandard, but well-defined behavior in practice. + * Don't use static: it is detrimental to performance. 
+ */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ + && __has_builtin(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. 
+ */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ +static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ +static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ +static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ +static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ + +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= PRIME32_1; +#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * This inline assembly hack forces acc into a normal register. This is the + * only thing that prevents GCC and Clang from autovectorizing the XXH32 + * loop (pragmas and attributes don't work for some resason) without globally + * disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. 
+ * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * How this hack works: + * __asm__("" // Declare an assembly block but don't declare any instructions + * : // However, as an Input/Output Operand, + * "+r" // constrain a read/write operand (+) as a general purpose register (r). + * (acc) // and set acc as the operand + * ); + * + * Because of the 'r', the compiler has promised that seed will be in a + * general purpose register and the '+' says that it will be 'read/write', + * so it has to assume it has changed. It is like volatile without all the + * loads and stores. + * + * Since the argument has to be in a normal register (not an SSE register), + * each time XXH32_round is called, it is impossible to vectorize. 
+ */ + __asm__("" : "+r" (acc)); +#endif + return acc; +} + +/* mix all bits */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1 do { \ + h32 += (*ptr++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1; \ +} while (0) + +#define PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + PROCESS4; + len -= 4; + } + while (len > 0) { + PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + 
len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; + xxh_u32 v2 = seed + PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + 
memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, 
(size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + + +/*! + * XXH_REROLL_XXH64: + * Whether to reroll the XXH64_finalize() loop. + * + * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a + * performance gain on 64-bit hosts, as only one jump is required. 
+ * + * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit + * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial + * to unroll. The code becomes ridiculously large (the largest function in the + * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is + * also slightly faster because it fits into cache better and is more likely + * to be inlined by the compiler. + * + * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. + */ +#ifndef XXH_REROLL_XXH64 +# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ + || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ + || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ + || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ + || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ + || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ +# define XXH_REROLL_XXH64 1 +# else +# define XXH_REROLL_XXH64 0 +# endif +#endif /* !defined(XXH_REROLL_XXH64) */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. 
+ */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define PROCESS1_64 do { \ + h64 ^= (*ptr++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; \ +} while (0) + +#define PROCESS4_64 do { \ + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ + ptr += 4; \ + h64 = 
XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; \ +} while (0) + +#define PROCESS8_64 do { \ + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ + ptr += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} while (0) + + /* Rerolled version for 32-bit targets is faster and much smaller. */ + if (XXH_REROLL || XXH_REROLL_XXH64) { + len &= 31; + while (len >= 8) { + PROCESS8_64; + len -= 8; + } + if (len >= 4) { + PROCESS4_64; + len -= 4; + } + while (len > 0) { + PROCESS1_64; + --len; + } + return XXH64_avalanche(h64); + } else { + switch(len & 31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 
7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + } + /* impossible to reach */ + XXH_ASSERT(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input + len; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; + xxh_u64 v2 = seed + PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); + +#else + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, 
XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, 
XXH_readLE64(state->mem64+3)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + + + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization 
+************************************************************************ */ + +#include "xxh3.h" + + +#endif /* XXH_NO_LONG_LONG */ + + +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif -- cgit v1.2.3 From 68d552074bf2c1989d96a8c28cc3f6fe1e6c8b8e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 14:42:37 +0200 Subject: compile UMULLs and some fixes --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 33 +++++++++++++++++++++++++-------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 3 ++- 4 files changed, 30 insertions(+), 12 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 14c223b..43b94b6 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -301,10 +301,11 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } -void Compiler::A_Comp_SMULL_SMLAL() +void Compiler::A_Comp_Mul_Long() { bool S = CurInstr.Instr & (1 << 20); bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); OpArg rd = MapReg(CurInstr.A_Reg(16)); OpArg rm = MapReg(CurInstr.A_Reg(0)); OpArg rs = MapReg(CurInstr.A_Reg(8)); @@ -318,18 +319,34 @@ void Compiler::A_Comp_SMULL_SMLAL() MOV(32, R(RSCRATCH3), rs); TEST(32, R(RSCRATCH3), R(RSCRATCH3)); FixupBranch zeroBSR = J_CC(CC_Z); - BSR(32, RSCRATCH2, R(RSCRATCH3)); - NOT(32, R(RSCRATCH3)); - BSR(32, RSCRATCH, R(RSCRATCH3)); - CMP(32, R(RSCRATCH2), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + if (sign) + { + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + } + else + { + BSR(32, RSCRATCH, R(RSCRATCH3)); + } + SHR(32, R(RSCRATCH), Imm8(3)); SetJumpTarget(zeroBSR); // fortunately that's even right Comp_AddCycles_CI(RSCRATCH, 2); } - MOVSX(64, 32, RSCRATCH2, rm); - MOVSX(64, 32, 
RSCRATCH3, rs); + if (sign) + { + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + } + else + { + MOV(32, R(RSCRATCH2), rm); + MOV(32, R(RSCRATCH3), rs); + } if (add) { MOV(32, R(RSCRATCH), rd); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index be3709e..1b2d312 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -300,7 +300,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), // Mul - F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff F(A_Comp_CLZ), NULL, NULL, NULL, NULL, // STR @@ -628,7 +628,7 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } else { - ConstantCycles += i + cycles; + ConstantCycles += cycles; SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index b428c33..a448b6d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -89,7 +89,7 @@ public: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); - void A_Comp_SMULL_SMLAL(); + void A_Comp_Mul_Long(); void A_Comp_CLZ(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 4cafc1c..7f6fa53 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -423,7 +423,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (flags & memop_SubtractOffset) { - MOV(32, R(finalAddr), rnMapped); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); if (!offset.IsZero()) SUB(32, R(finalAddr), 
offset); } -- cgit v1.2.3 From a9dd6e30adc590e11e3a076c1245f1b0b48f27f6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 19:35:40 +0200 Subject: implement msr and mrs for the x64 JIT --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 127 ++++++++++++++++++++++++++++++++++++- src/ARMJIT_x64/ARMJIT_Compiler.h | 3 + src/ARM_InstrInfo.cpp | 4 ++ 4 files changed, 134 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index cc8d4ce..46f71f1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -824,7 +824,7 @@ void InvalidateITCM(u32 addr) void InvalidateAll() { - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); for (auto it : JitBlocks) { JitBlock* block = it.second; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 1b2d312..52a16dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -38,6 +38,131 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(ReadBanked); + MOV(32, rd, R(ABI_PARAM3)); + } + else + MOV(32, rd, R(RCPSR)); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + OpArg val = CurInstr.Instr & (1 << 25) + ? 
Imm32(ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))) + : MapReg(CurInstr.A_Reg(0)); + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(ReadBanked); + + MOV(32, R(RSCRATCH2), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH3), Imm32(0xFFFFFFFF)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_NE); + AND(32, R(RSCRATCH2), Imm32(mask)); + + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(ABI_PARAM3), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(ABI_PARAM3), R(RSCRATCH2)); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + CALL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + AND(32, R(RCPSR), Imm32(~mask)); + if (val.IsImm()) + { + MOV(32, R(RSCRATCH), val); + AND(32, R(RSCRATCH), Imm32(mask)); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + OR(32, R(RCPSR), Imm32(val.Imm32() & mask)); + } + } + else + { + MOV(32, R(RSCRATCH2), Imm32(mask)); + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + AND(32, R(RSCRATCH3), Imm32(0xFFFFFF00)); + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x10)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_E); + + MOV(32, R(RSCRATCH3), R(RCPSR)); + + // I need you ANDN + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + NOT(32, R(RSCRATCH)); + AND(32, R(RCPSR), R(RSCRATCH)); + + AND(32, R(RSCRATCH2), val); + OR(32, R(RCPSR), R(RSCRATCH2)); + + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 
0x7F00); + if (Thumb || CurInstr.Cond() >= 0xE) + RegCache.Flush(); + else + { + // the ugly way... + // we only save them, to load and save them again + for (int reg : hiRegsLoaded) + SaveReg(reg, RegCache.Mapping[reg]); + } + + MOV(32, R(ABI_PARAM3), R(RCPSR)); + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + CALL((void*)&ARM::UpdateMode); + + if (!Thumb && CurInstr.Cond() < 0xE) + { + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } + } + } +} + /* We'll repurpose this .bss memory @@ -328,7 +453,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(A_Comp_MSR), F(A_Comp_MSR), F(A_Comp_MRS), NULL, NULL, NULL, F(Nop) }; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a448b6d..2230eb8 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -100,6 +100,9 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b884773..28362d9 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -427,6 +427,10 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = ak_UNK; } } + if (res.Kind == ak_MRS && !(instr & (1 << 22))) + res.ReadFlags |= flag_N | flag_Z | flag_C | flag_V; + if ((res.Kind == ak_MSR_IMM || res.Kind == ak_MSR_REG) && instr & (1 << 19)) + res.WriteFlags |= flag_N | flag_Z | flag_C | flag_V; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); -- cgit v1.2.3 From 59c8d3976562ec3ed057f21116b76a3a532bc4d1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 16:17:16 +0200 Subject: hopefully fix stack handling for linux --- 
src/ARMJIT_x64/ARMJIT_Linkage.s | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.s index dbbb024..0a84df0 100644 --- a/src/ARMJIT_x64/ARMJIT_Linkage.s +++ b/src/ARMJIT_x64/ARMJIT_Linkage.s @@ -44,6 +44,8 @@ ARM_Dispatch: #ifdef WIN64 sub rsp, 0x28 +#else + sub rsp, 0x8 #endif mov RCPU, ARG1_REG64 mov RCPSR, [RCPU + ARM_CPSR_offset] @@ -58,6 +60,8 @@ ARM_Ret: #ifdef WIN64 add rsp, 0x28 +#else + add rsp, 0x8 #endif pop rbp -- cgit v1.2.3 From b0b9ec42e42d491a90352aea040eb6ffb319cdf9 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 20:47:36 +0200 Subject: don't use param registers for ReadBanked/WriteBanked should fix linux build --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 64 ++++++++++++++++++------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 +++++----- 3 files changed, 40 insertions(+), 41 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 52a16dc..8d20425 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -48,10 +48,10 @@ void Compiler::A_Comp_MRS() { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); - XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(ReadBanked); - MOV(32, rd, R(ABI_PARAM3)); + MOV(32, rd, R(RSCRATCH3)); } else MOV(32, rd, R(RCPSR)); @@ -75,28 +75,26 @@ void Compiler::A_Comp_MSR() { MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); - XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + XOR(32, R(RSCRATCH3), R(RSCRATCH3)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(ReadBanked); - MOV(32, R(RSCRATCH2), Imm32(0xFFFFFF00)); - MOV(32, R(RSCRATCH3), Imm32(0xFFFFFFFF)); + MOV(32, R(RSCRATCH2), 
Imm32(mask)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + AND(32, R(RSCRATCH4), Imm32(0xFFFFFF00)); MOV(32, R(RSCRATCH), R(RCPSR)); AND(32, R(RSCRATCH), Imm8(0x1F)); CMP(32, R(RSCRATCH), Imm8(0x10)); - CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_NE); - AND(32, R(RSCRATCH2), Imm32(mask)); + CMOVcc(32, RSCRATCH2, R(RSCRATCH4), CC_E); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - NOT(32, R(RSCRATCH)); - AND(32, R(ABI_PARAM3), R(RSCRATCH)); + MOV(32, R(RSCRATCH4), R(RSCRATCH2)); + NOT(32, R(RSCRATCH4)); + AND(32, R(RSCRATCH3), R(RSCRATCH4)); AND(32, R(RSCRATCH2), val); - OR(32, R(ABI_PARAM3), R(RSCRATCH2)); + OR(32, R(RSCRATCH3), R(RSCRATCH2)); - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - MOV(32, R(ABI_PARAM2), Imm32(15 - 8)); + MOV(32, R(RSCRATCH2), Imm32(15 - 8)); CALL(WriteBanked); } else @@ -219,13 +217,13 @@ Compiler::Compiler() { // RSCRATCH mode - // ABI_PARAM2 reg number - // ABI_PARAM3 value in current mode - // ret - ABI_PARAM3 + // RSCRATCH2 reg number + // RSCRATCH3 value in current mode + // ret - RSCRATCH3 ReadBanked = (void*)GetWritableCodePtr(); CMP(32, R(RSCRATCH), Imm8(0x11)); FixupBranch fiq = J_CC(CC_E); - SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); FixupBranch notEverything = J_CC(CC_L); CMP(32, R(RSCRATCH), Imm8(0x12)); FixupBranch irq = J_CC(CC_E); @@ -239,30 +237,30 @@ Compiler::Compiler() RET(); SetJumpTarget(fiq); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ))); RET(); SetJumpTarget(irq); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ))); RET(); SetJumpTarget(svc); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC))); RET(); SetJumpTarget(abt); - MOV(32, 
R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT))); RET(); SetJumpTarget(und); - MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND))); RET(); } { // RSCRATCH mode - // ABI_PARAM2 reg n - // ABI_PARAM3 value + // RSCRATCH2 reg n + // RSCRATCH3 value // carry flag set if the register isn't banked WriteBanked = (void*)GetWritableCodePtr(); CMP(32, R(RSCRATCH), Imm8(0x11)); FixupBranch fiq = J_CC(CC_E); - SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + SUB(32, R(RSCRATCH2), Imm8(13 - 8)); FixupBranch notEverything = J_CC(CC_L); CMP(32, R(RSCRATCH), Imm8(0x12)); FixupBranch irq = J_CC(CC_E); @@ -277,23 +275,23 @@ Compiler::Compiler() RET(); SetJumpTarget(fiq); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(irq); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(svc); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(abt); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)), R(RSCRATCH3)); CLC(); RET(); SetJumpTarget(und); - MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)), R(RSCRATCH3)); CLC(); RET(); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 2230eb8..e0a4978 100644 --- 
a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -16,6 +16,7 @@ const Gen::X64Reg RCPSR = Gen::R15; const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +const Gen::X64Reg RSCRATCH4 = Gen::R8; struct ComplexOperand { diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 7f6fa53..85a3737 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -540,14 +540,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); - POP(ABI_PARAM3); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + POP(RSCRATCH3); CALL(WriteBanked); FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); else - SaveReg(reg, ABI_PARAM3); + SaveReg(reg, RSCRATCH3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) @@ -600,12 +600,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc firstUserMode = false; } if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, ABI_PARAM3); + LoadReg(reg, RSCRATCH3); else - MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); - MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); CALL(ReadBanked); - PUSH(ABI_PARAM3); + PUSH(RSCRATCH3); } else if (RegCache.Mapping[reg] == INVALID_REG) { -- cgit v1.2.3 From bcc4b5c8dda5ec91127808a525e2b7dbda41a4f3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 26 Apr 2020 23:25:32 +0200 Subject: fix regression from last commit also a small mistake with msr --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 24 ++++++++++++++++-------- 2 files 
changed, 17 insertions(+), 9 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8d20425..dd20e3c 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -105,7 +105,7 @@ void Compiler::A_Comp_MSR() if ((mask & 0xFF) == 0) { AND(32, R(RCPSR), Imm32(~mask)); - if (val.IsImm()) + if (!val.IsImm()) { MOV(32, R(RSCRATCH), val); AND(32, R(RSCRATCH), Imm32(mask)); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 85a3737..b595e32 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -502,14 +502,6 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int regsCount = regs.Count(); - if (decrement) - { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes @@ -519,6 +511,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { Comp_AddCycles_CDI(); + if (decrement) + { + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(ABI_PARAM1), MapReg(rn)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -618,6 +618,14 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } } + if (decrement) + { + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(ABI_PARAM1), MapReg(rn)); + MOV(64, R(ABI_PARAM2), R(RSP)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); -- cgit v1.2.3 From 0f53a34551d60964345debb1766f81ca4686eb17 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 00:45:05 +0200 Subject: rewrite JIT memory emulation --- src/ARM.cpp | 10 +- src/ARM.h | 24 +- src/ARMJIT.cpp | 905 +++++++++++++++++++++++++--------- src/ARMJIT.h | 65 ++- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 +- src/ARMJIT_Internal.h | 68 ++- src/ARMJIT_RegisterCache.h | 18 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 34 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 935 +++++++++++++++++++----------------- src/ARM_InstrInfo.cpp | 16 +- src/CP15.cpp | 44 +- src/NDS.cpp | 105 +++- src/NDS.h | 8 + 14 files changed, 1465 insertions(+), 814 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index 95d2b8b..205332d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -579,7 +579,8 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<0>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); + if (!translatedAddr) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); @@ -589,7 +590,7 @@ void ARMv5::ExecuteJIT() // hack so Cycles <= 0 becomes Cycles < 0 Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); if (block) ARM_Dispatch(this, block); else @@ -722,7 +723,8 @@ void ARMv4::ExecuteJIT() 
while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<1>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); + if (!translatedAddr) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); @@ -731,7 +733,7 @@ void ARMv4::ExecuteJIT() Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); if (block) ARM_Dispatch(this, block); else diff --git a/src/ARM.h b/src/ARM.h index 4877956..f64b7fe 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -308,7 +308,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = NDS::ARM7Read8(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -317,7 +317,7 @@ public: addr &= ~1; *val = NDS::ARM7Read16(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -326,7 +326,7 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -341,7 +341,7 @@ public: void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -350,7 +350,7 @@ public: addr &= ~1; NDS::ARM7Write16(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -359,7 +359,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -390,7 +390,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 
0x02) Cycles -= numC + numD; @@ -417,7 +417,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -443,4 +443,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 46f71f1..9602aed 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "GPU.h" #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" @@ -34,9 +35,10 @@ namespace ARMJIT #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) -Compiler* compiler; +Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = { +const u32 ExeMemRegionSizes[] = +{ 0x8000, // Unmapped Region (dummy) 0x8000, // ITCM 4*1024*1024, // Main RAM @@ -48,7 +50,8 @@ const u32 ExeMemRegionSizes[] = { 0x40000 // ARM7 WVRAM }; -const u32 ExeMemRegionOffsets[] = { +const u32 ExeMemRegionOffsets[] = +{ 0, 0x8000, 0x10000, @@ -61,65 +64,391 @@ const u32 ExeMemRegionOffsets[] = { 0x518000, }; -#define DUP2(x) x, x - -const static ExeMemKind JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(exeMem_ITCM), - /* 1X*/ DUP2(exeMem_ITCM), // mirror - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ DUP2(exeMem_SWRAM), - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ exeMem_Unmapped, - exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_ARM9_BIOS) - }, - //arm7 - { - /* 0X*/ DUP2(exeMem_ARM7_BIOS), - /* 1X*/ DUP2(exeMem_Unmapped), - /* 2X*/ 
DUP2(exeMem_MainRAM), - /* 3X*/ exeMem_SWRAM, - exeMem_ARM7_WRAM, - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_Unmapped) - } -}; - -#undef DUP2 - /* translates address to pseudo physical address - more compact, eliminates mirroring, everything comes in a row - we only need one translation table */ -u32 AddrTranslate9[0x2000]; -u32 AddrTranslate7[0x4000]; + +u32 TranslateAddr9(u32 addr) +{ + switch (ClassifyAddress9(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM9: + if (NDS::SWRAM_ARM9) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); + else + return 0; + case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); + case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? 
ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; + case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); + default: return 0; + } +} + +u32 TranslateAddr7(u32 addr) +{ + switch (ClassifyAddress7(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM7: + if (NDS::SWRAM_ARM7) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); + else + return 0; + case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; + case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); + case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); + default: return 0; + } +} AddressRange CodeRanges[ExeMemSpaceSize / 512]; -std::unordered_map JitBlocks; +TinyVector InvalidLiterals; + +std::unordered_map JitBlocks9; +std::unordered_map JitBlocks7; + +u8 MemoryStatus9[0x800000]; +u8 MemoryStatus7[0x800000]; + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + return memregion_SWRAM9; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7) + return memregion_SWRAM7; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: 
+ return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void UpdateMemoryStatus9(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress9(addr); + u32 pseudoPhyisical = TranslateAddr9(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus9[i * 8 + j] = val; + } + } +} + +void UpdateMemoryStatus7(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress7(addr); + u32 pseudoPhyisical = TranslateAddr7(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus7[i * 8 + j] = val; + } + } +} + +void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) +{ + for (u32 i = 1; i < exeMem_Count; i++) + { + if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) + { + for (u32 num = 0; num < 2; num++) + { + u32 physSize = ExeMemRegionSizes[i]; + u32 mapSize = 0; + u32 mapStart = 0; + switch (i) + { + case exeMem_ITCM: + if (num == 0) + mapStart = 0; mapSize = NDS::ARM9->ITCMSize; + break; + case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; + case exeMem_SWRAM: + if (num == 0) + { + if (NDS::SWRAM_ARM9) + mapStart = 0x3000000, mapSize = 0x1000000; + else + mapStart = mapSize = 0; + } + else + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3000000, mapSize = 0x800000; + else + mapStart = mapSize = 0; + } + break; + case exeMem_LCDC: + if (num == 0) + mapStart = 0x6800000, mapSize = 0xA4000; + break; + case exeMem_ARM9_BIOS: + if (num == 0) + mapStart = 0xFFFF0000, mapSize = 0x10000; + break; + case 
exeMem_ARM7_BIOS: + if (num == 1) + mapStart = 0; mapSize = 0x4000; + break; + case exeMem_ARM7_WRAM: + if (num == 1) + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3800000, mapSize = 0x800000; + else + mapStart = 0x3000000, mapSize = 0x1000000; + } + break; + case exeMem_ARM7_WVRAM: + if (num == 1) + mapStart = 0x6000000, mapSize = 0x1000000; + break; + } + + for (u32 j = 0; j < mapSize / physSize; j++) + { + u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); + if (num == 0 + && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + if (invalidate) + { + if (num == 0) + MemoryStatus9[virtAddr / 512] |= 0x80; + else + MemoryStatus7[virtAddr / 512] |= 0x80; + } + else + { + if (num == 0) + MemoryStatus9[virtAddr / 512] &= ~0x80; + else + MemoryStatus7[virtAddr / 512] &= ~0x80; + } + } + + } + return; + } + } + + assert(false); +} + +template +T SlowRead9(ARMv5* cpu, u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same::value) + val = NDS::ARM9Read32(addr); + else if (std::is_same::value) + val = NDS::ARM9Read16(addr); + else + val = NDS::ARM9Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite9(ARMv5* cpu, u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + InvalidateITCMIfNecessary(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same::value) + { + NDS::ARM9Write32(addr, val); + } + else if (std::is_same::value) + { + NDS::ARM9Write16(addr, val); + } + else + { + NDS::ARM9Write8(addr, 
val); + } +} + +template void SlowWrite9(ARMv5*, u32, u32); +template void SlowWrite9(ARMv5*, u32, u16); +template void SlowWrite9(ARMv5*, u32, u8); + +template u32 SlowRead9(ARMv5*, u32); +template u16 SlowRead9(ARMv5*, u32); +template u8 SlowRead9(ARMv5*, u32); + +template +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same::value) + val = NDS::ARM7Read32(addr); + else if (std::is_same::value) + val = NDS::ARM7Read16(addr); + else + val = NDS::ARM7Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same::value) + NDS::ARM7Write32(addr, val); + else if (std::is_same::value) + NDS::ARM7Write16(addr, val); + else + NDS::ARM7Write8(addr, val); +} + +template +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite9(cpu, addr, data[i]); + else + data[i] = SlowRead9(cpu, addr); + addr += !PreInc * 4; + } +} + +template +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite7(addr, data[i]); + else + data[i] = SlowRead7(addr); + addr += !PreInc * 4; + } +} + +template void SlowWrite7(u32, u32); +template void SlowWrite7(u32, u16); +template void SlowWrite7(u32, u8); + +template u32 SlowRead7(u32); +template u16 SlowRead7(u32); +template u8 SlowRead7(u32); + +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* 
data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); template struct UnreliableHashTable @@ -211,31 +540,25 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp; +UnreliableHashTable FastBlockLookUp9; +UnreliableHashTable FastBlockLookUp7; void Init() { - for (int i = 0; i < 0x2000; i++) - { - ExeMemKind kind = JIT_MEM[0][i >> 8]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); - } - for (int i = 0; i < 0x4000; i++) - { - ExeMemKind kind = JIT_MEM[1][i >> 9]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); - } - - compiler = new Compiler(); + JITCompiler = new Compiler(); } void DeInit() { - delete compiler; + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + UpdateMemoryStatus9(0, 0xFFFFFFFF); + UpdateMemoryStatus7(0, 0xFFFFFFFF); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -256,25 +579,31 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) { - switch (instr.Info.Kind) + if (!thumb) { - case ARMInstrInfo::ak_STR_IMM: - case ARMInstrInfo::ak_STRB_IMM: - addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STRD_IMM: - case ARMInstrInfo::ak_STRH_IMM: - addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} - addr = instr.Addr + 8; + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 
1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); return true; - default: - JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); - return false; } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; } bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, @@ -453,6 +782,8 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F + +extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -463,31 +794,33 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - if (!(cpu->Num == 0 - ? IsMapped<0>(blockAddr) - : IsMapped<1>(blockAddr))) + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr9(blockAddr) + : TranslateAddr7(blockAddr); + if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) { printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); } - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? 
TranslateAddr<0>(blockAddr) - : TranslateAddr<1>(blockAddr); - FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; - u32 addresseRanges[32] = {}; + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; u32 numAddressRanges = 0; + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, - CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -507,23 +840,29 @@ void CompileBlock(ARM* cpu) nextInstrAddr[1] = r15; JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); - u32 translatedAddr = (cpu->Num == 0 - ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; - if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = cpu->Num == 0 + ? 
TranslateAddr9(instrs[i].Addr) + : TranslateAddr7(instrs[i].Addr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { bool returning = false; for (int j = 0; j < numAddressRanges; j++) { - if (addresseRanges[j] == translatedAddr) + if (addressRanges[j] == translatedAddrRounded) { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); returning = true; break; } } if (!returning) - addresseRanges[numAddressRanges++] = translatedAddr; + addressRanges[numAddressRanges++] = translatedAddrRounded; } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); if (cpu->Num == 0) { @@ -572,7 +911,8 @@ void CompileBlock(ARM* cpu) u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM - || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else @@ -583,21 +923,26 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; - if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem - && instrs[i].Info.SrcRegs == (1 << 15) - && instrs[i].Info.DstRegs == 0) + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) { - assert (!thumb); - - u32 addr; - if (DecodeLiteral(instrs[i], addr)) - { - JIT_DEBUGPRINT("pc relative write detected\n"); - u32 translatedAddr = cpu->Num == 0 ? 
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - ARMJIT::InvalidateByAddr(translatedAddr, false); - CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); - } + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(literalAddr) + : TranslateAddr7(literalAddr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; } if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 @@ -650,8 +995,8 @@ void CompileBlock(ARM* cpu) else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr<0>(target) - : TranslateAddr<1>(target); + ? TranslateAddr9(target) + : TranslateAddr7(target); if (link) { @@ -688,36 +1033,29 @@ void CompileBlock(ARM* cpu) i++; - bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? 
instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; if (prevBlock) { RestoreCandidates.Remove(pseudoPhysicalAddr); - if (prevBlock->NumInstrs == i) - { - for (int j = 0; j < i; j++) - { - if (prevBlock->Instrs()[j] != instrs[j].Instr) - { - mayRestore = false; - break; - } - } - } - else - mayRestore = false; - if (prevBlock->NumAddresses == numAddressRanges) + mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { for (int j = 0; j < numAddressRanges; j++) { - if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) { mayRestore = false; break; @@ -739,18 +1077,21 @@ void CompileBlock(ARM* cpu) if (prevBlock) delete prevBlock; - block = new JitBlock(i, numAddressRanges); - for (int j = 0; j < i; j++) - block->Instrs()[j] = instrs[j].Instr; + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; for (int j = 0; j < numAddressRanges; j++) - block->AddressRanges()[j] = addresseRanges[j]; + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; - block->StartAddr = blockAddr; block->PseudoPhysicalAddr = pseudoPhysicalAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = 
JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -760,23 +1101,73 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { - assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; + CodeRanges[addressRanges[j] / 512].Blocks.Add(block); + + UpdateRegionByPseudoPhyiscal(addressRanges[j], true); } - JitBlocks[pseudoPhysicalAddr] = block; - FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); + if (cpu->Num == 0) + { + JitBlocks9[pseudoPhysicalAddr] = block; + FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } + else + { + JitBlocks7[pseudoPhysicalAddr] = block; + FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } } -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) +void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - int startLength = range->Blocks.Length; - for (int i = 0; i < range->Blocks.Length; i++) + u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) { - assert(range->Blocks.Length == startLength); JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + bool literalInvalidation = false; + 
for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == pseudoPhysical) + { + if (InvalidLiterals.Find(pseudoPhysical) != -1) + { + InvalidLiterals.Add(pseudoPhysical); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -786,76 +1177,59 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); + + if (otherRange->Blocks.Length == 0) + { + otherRange->Code = 0; + UpdateRegionByPseudoPhyiscal(addr, false); + } } } for (int j = 0; j < block->NumLinks(); j++) - compiler->UnlinkBlock(block->Links()[j]); + JITCompiler->UnlinkBlock(block->Links()[j]); + block->ResetLinks(); - JitBlocks.erase(block->PseudoPhysicalAddr); - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + if (block->Num == 0) + { + JitBlocks9.erase(block->PseudoPhysicalAddr); + FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); + } + else + { + JitBlocks7.erase(block->PseudoPhysicalAddr); + FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); + } - if (mayRestore) + if (!literalInvalidation) { JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); if (prevBlock) delete prevBlock; } + else + { + delete block; + } } - if ((range->TimesInvalidated + 1) > range->TimesInvalidated) - range->TimesInvalidated++; - - range->Blocks.Clear(); -} -void InvalidateByAddr7(u32 addr) -{ - u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) - InvalidateByAddr(pseudoPhysical); + if (range->Blocks.Length == 0) + UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); } -void InvalidateITCM(u32 addr) +void InvalidateRegionIfNecessary(u32 pseudoPhyisical) { - u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if 
(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) - InvalidateByAddr(pseudoPhysical); -} - -void InvalidateAll() -{ - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); - for (auto it : JitBlocks) - { - JitBlock* block = it.second; - - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); - - for (int i = 0; i < block->NumAddresses; i++) - { - u32 addr = block->AddressRanges()[i]; - AddressRange* range = &CodeRanges[addr / 512]; - range->Blocks.Clear(); - if (range->TimesInvalidated + 1 > range->TimesInvalidated) - range->TimesInvalidated++; - } - for (int i = 0; i < block->NumLinks(); i++) - compiler->UnlinkBlock(block->Links()[i]); - block->ResetLinks(); - - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); - if (prevBlock) - delete prevBlock; - } - - JitBlocks.clear(); + if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) + InvalidateByAddr(pseudoPhyisical); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - FastBlockLookUp.Reset(); + InvalidLiterals.Clear(); + FastBlockLookUp9.Reset(); + FastBlockLookUp7.Reset(); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -870,61 +1244,119 @@ void ResetBlockCache() RestoreCandidates.Table[i].ValB = NULL; } } - for (auto it : JitBlocks) + for (auto it : JitBlocks9) { JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].TimesInvalidated = 0; - CodeRanges[addr / 512].InvalidLiterals = 0; + CodeRanges[addr / 512].Code = 0; } delete block; } - JitBlocks.clear(); + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].Code = 0; + } + } + JitBlocks9.clear(); + 
JitBlocks7.clear(); - compiler->Reset(); + JITCompiler->Reset(); } +template JitBlockEntry LookUpBlockEntry(u32 addr) { - u32 entryOffset = FastBlockLookUp.LookUp(addr); + auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; + u32 entryOffset = fastMap.LookUp(addr); if (entryOffset != UINT32_MAX) - return compiler->AddEntryOffset(entryOffset); + return JITCompiler->AddEntryOffset(entryOffset); - auto block = JitBlocks.find(addr); - if (block != JitBlocks.end()) + auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; + auto block = slowMap.find(addr); + if (block != slowMap.end()) { - FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); return block->second->EntryPoint; } return NULL; } +template JitBlockEntry LookUpBlockEntry<0>(u32); +template JitBlockEntry LookUpBlockEntry<1>(u32); + template void LinkBlock(ARM* cpu, u32 codeOffset) { - u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); - auto block = JitBlocks.find(targetPseudoPhys); - if (block == JitBlocks.end()) + auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; + u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); + u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); + auto block = blockMap.find(targetPseudoPhys); + if (block == blockMap.end()) { CompileBlock(cpu); - block = JitBlocks.find(targetPseudoPhys); + block = blockMap.find(targetPseudoPhys); } JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); block->second->AddLink(codeOffset); - compiler->LinkBlock(codeOffset, block->second->EntryPoint); + JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + +template void LinkBlock<0>(ARM*, u32); +template void LinkBlock<1>(ARM*, u32); + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } } void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) { - if ((addr & 0xFF000000) == 0x04000000) + switch (addr & 0xFF000000) { + case 0x04000000: if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) return (void*)NDSCart::ReadROMData; @@ -949,13 +1381,25 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) switch (size | store) { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; + case 8: return (void*)NDS::ARM9IORead8; + case 9: 
return (void*)NDS::ARM9IOWrite8; case 16: return (void*)NDS::ARM9IORead16; case 17: return (void*)NDS::ARM9IOWrite16; case 32: return (void*)NDS::ARM9IORead32; case 33: return (void*)NDS::ARM9IOWrite32; } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead; + case 9: return NULL; + case 16: return (void*)VRAMRead; + case 17: return (void*)VRAMWrite; + case 32: return (void*)VRAMRead; + case 33: return (void*)VRAMWrite; + } + break; } } else @@ -987,20 +1431,31 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } break; case 0x04800000: - if (addr < 0x04810000 && size == 16) + if (addr < 0x04810000 && size >= 16) { - if (store) - return (void*)Wifi::Write; - else - return (void*)Wifi::Read; + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } } break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } } } return NULL; } } - -template void ARMJIT::LinkBlock<0>(ARM*, u32); -template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index cab385f..44a6140 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,45 +28,60 @@ extern const u32 ExeMemRegionSizes[]; typedef u32 (*JitBlockEntry)(); -extern u32 AddrTranslate9[0x2000]; -extern u32 AddrTranslate7[0x4000]; - const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... 
-template -inline bool IsMapped(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; -} - -template -inline u32 TranslateAddr(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); -} +u32 TranslateAddr9(u32 addr); +u32 TranslateAddr7(u32 addr); +template JitBlockEntry LookUpBlockEntry(u32 addr); - void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); -void InvalidateAll(); +void Reset(); + +void InvalidateByAddr(u32 pseudoPhysical); + +void InvalidateRegionIfNecessary(u32 addr); -void InvalidateITCM(u32 addr); -void InvalidateByAddr7(u32 addr); +inline void InvalidateMainRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); +} +inline void InvalidateITCMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); +} +inline void InvalidateLCDCIfNecessary(u32 addr) +{ + if (addr < 0x68A3FFF) + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); +} +inline void InvalidateSWRAM7IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); +} +inline void InvalidateSWRAM9IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); +} +inline void InvalidateARM7WRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); +} +inline void InvalidateARM7WVRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr 
& 0x1FFFF)); +} void CompileBlock(ARM* cpu); void ResetBlockCache(); +void UpdateMemoryStatus9(u32 start, u32 end); +void UpdateMemoryStatus7(u32 start, u32 end); + } extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 00fa436..a67f357 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) + if ((CurInstr.DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 66d1808..4e45760 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -152,30 +152,34 @@ struct __attribute__((packed)) TinyVector class JitBlock { public: - JitBlock(u32 numInstrs, u32 numAddresses) + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) { - NumInstrs = numInstrs; + Num = num; NumAddresses = numAddresses; - Data.SetLength(numInstrs + numAddresses); + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); } - u32 StartAddr; u32 PseudoPhysicalAddr; - - u32 NumInstrs; - u32 NumAddresses; + + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; JitBlockEntry EntryPoint; - u32* Instrs() - { return &Data[0]; } u32* AddressRanges() - { return &Data[NumInstrs]; } + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } u32* 
Links() - { return &Data[NumInstrs + NumAddresses]; } + { return &Data[NumAddresses * 2 + NumLiterals]; } u32 NumLinks() - { return Data.Length - NumInstrs - NumAddresses; } + { return Data.Length - NumAddresses * 2 - NumLiterals; } void AddLink(u32 link) { @@ -184,7 +188,7 @@ public: void ResetLinks() { - Data.SetLength(NumInstrs + NumAddresses); + Data.SetLength(NumAddresses * 2 + NumLiterals); } private: @@ -200,8 +204,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; - u16 InvalidLiterals; - u16 TimesInvalidated; + u32 Code; }; extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; @@ -210,14 +213,45 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemRegion9[0x80000]; -extern u8 MemRegion7[0x80000]; +extern u8 MemoryStatus9[0x800000]; +extern u8 MemoryStatus7[0x800000]; + +extern TinyVector InvalidLiterals; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); template void LinkBlock(ARM* cpu, u32 codeOffset); +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM9, + memregion_SWRAM7, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +template T SlowRead9(ARMv5* cpu, u32 addr); +template void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); + +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 5e18e84..0547c84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,20 +95,6 @@ public: LiteralsLoaded = 0; } - BitSet32 
GetPushRegs() - { - BitSet16 used; - for (int i = 0; i < InstrsCount; i++) - used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); - - BitSet32 res; - u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); - for (int i = 0; i < registersMax; i++) - res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); - - return res; - } - void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -139,7 +125,6 @@ public: UnloadRegister(reg); u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; - u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -182,13 +167,12 @@ public: if (left-- == 0) break; - writeRegs |= (1 << reg) & instr.Info.DstRegs; LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); } } } - DirtyRegs |= writeRegs & ~(1 << 15); + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index dd20e3c..eee2e0f 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -195,26 +195,6 @@ Compiler::Compiler() Reset(); - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - } - MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; - MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; - MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; - MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; - MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; - MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) - { - MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); - MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); - MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); - } - { // 
RSCRATCH mode // RSCRATCH2 reg number @@ -317,6 +297,12 @@ Compiler::Compiler() // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; } void Compiler::LoadCPSR() @@ -504,6 +490,9 @@ void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; } void Compiler::Comp_SpecialBranchBehaviour(bool taken) @@ -544,8 +533,16 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); ResetBlockCache(); + } ConstantCycles = 0; Thumb = thumb; @@ -762,12 +759,14 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { + IrregularCycles = true; + s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e0a4978..9df218b 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -140,7 +140,7 @@ public: }; void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - void Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -154,12 +154,6 @@ public: void Comp_SpecialBranchBehaviour(bool taken); - void* Gen_MemoryRoutine9(bool store, int size); - - void* Gen_MemoryRoutineSeq9(bool store, bool preinc); - void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); - - void* Gen_ChangeCPSRRoutine(); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -193,6 +187,26 @@ public: return (u8*)entry - ResetStart; } + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + u8* ResetStart; u32 CodeMemSize; @@ -201,12 +215,6 @@ public: void* BranchStub[2]; - void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2]; - - void* MemoryFuncsSeq9[2][2]; - void* MemoryFuncsSeq7[2][2][2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp 
b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b595e32..c13b779 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -25,236 +25,17 @@ int squeezePointer(T* ptr) improvement. */ -/* - address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) - store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) -*/ -void* Compiler::Gen_MemoryRoutine9(bool store, int size) +bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM9Write32, true); break; - case 16: JMP((u8*)NDS::ARM9Write16, true); break; - case 8: JMP((u8*)NDS::ARM9Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - // everything's already in the appropriate register - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM9Read16, true); - } - else - JMP((u8*)NDS::ARM9Read8, true); - } - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - if (store) - MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); - else - { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - if (size == 32) - { - if 
(ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); + u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - SetJumpTarget(insideITCM); - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX - AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); - if (store) - { - MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - - // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! - static_assert(sizeof(AddressRange) == 16); - LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - JMP((u8*)InvalidateByAddr, true); - SetJumpTarget(noCode); - } - else + int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + if (invalidLiteralIdx != -1) { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - RET(); - - static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); - - return res; -} - -#define MEMORY_SEQ_WHILE_COND \ - if (!store) \ - MOV(32, currentElement, R(EAX));\ - if (!preinc) \ - ADD(32, R(ABI_PARAM1), Imm8(4)); \ - \ - SUB(32, R(ABI_PARAM3), Imm8(1)); \ - J_CC(CC_NZ, repeat); - -/* - ABI_PARAM1 address - ABI_PARAM2 address where registers are stored - ABI_PARAM3 how many values to read/write - - Dolphin x64CodeEmitter is my favourite assembler - */ -void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) -{ - void* res = 
(void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM9Write32); - } - else - CALL((void*)NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideITCM); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - - ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); - CALL((u8*)InvalidateByAddr); - 
ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - SetJumpTarget(noCode); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM7Write32); - } - else - CALL((void*)NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -#undef MEMORY_SEQ_WHILE_COND - -void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -276,12 +57,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) RegCache.PutLiteral(rd, val); Comp_AddCycles_CDI(); + + return true; } -/*void fault(u32 a, u32 b, u32 c, u32 d) -{ - printf("actually not static! %x %x %x %x\n", a, b, c, d); -}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,17 +70,12 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? 
TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, rd, addr); + + if (Comp_MemLoadLiteral(size, rd, addr)) return; - } } { @@ -314,173 +88,334 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz Comp_AddCycles_CDI(); } + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; - - void* memoryFunc = Num == 0 - ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] - : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (!addrIsStatic) { - u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - - /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); - MOV(32, R(ABI_PARAM1), Imm32(R15)); - MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); - CMP(32, R(RSCRATCH), Imm32(addr)); - FixupBranch eq = J_CC(CC_E); - CALL((void*)fault); - SetJumpTarget(eq);*/ - - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) { - ARMv5* cpu5 = (ARMv5*)CurCPU; + MOV(32, R(RSCRATCH3), rnMapped); - // stupid dtcm... 
- if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - // disable this for now as DTCM is located in heap - // which might excced the RIP-addressable range - //region.Mem = cpu5->DTCM; - //region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } + finalAddr = rnMapped.GetSimpleReg(); } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); - if (region.Mem != NULL) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + OpArg rm = MapReg(op2.Reg.Reg); - if (flags & memop_Store) + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) { - MOV(size, M(ptr), MapReg(rd)); + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); } else { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - if (size == 32 && addr & ~0x3) + if (flags & memop_SubtractOffset) { - ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } + else + MOV_sum(32, finalAddr, rnMapped, offset); } - - return; } - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memoryFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); } - X64Reg finalAddr = ABI_PARAM1; - if (flags & memop_Post) - { - MOV(32, R(ABI_PARAM1), rnMapped); + int expectedTarget = Num == 0 + ? 
ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + if (CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); - finalAddr = rnMapped.GetSimpleReg(); + switch (expectedTarget) + { + case memregion_MainRAM: + case memregion_DTCM: + case memregion_WRAM7: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_IO9: + case memregion_IO7: + case memregion_VWRAM: + compileFastPath = true; + break; + case memregion_Wifi: + compileFastPath = size >= 16; + break; + case memregion_VRAM: + compileFastPath = !(flags & memop_Store) || size >= 16; + case memregion_BIOS9: + compileFastPath = !(flags & memop_Store); + break; + default: break; } - if (op2.IsImm) + if (addrIsStatic && !compileFastPath) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + compileFastPath = false; + compileSlowPath = true; } - else + + if (addrIsStatic && compileSlowPath) + MOV(32, R(RSCRATCH3), Imm32(staticAddress)); + + if (compileFastPath) { - OpArg rm = MapReg(op2.Reg.Reg); + FixupBranch slowPath; + if (compileSlowPath) + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SHR(32, R(RSCRATCH), Imm8(9)); + if (flags & memop_Store) + { + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); + } + else + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + + slowPath = J_CC(CC_NE, true); + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 + || expectedTarget == memregion_BIOS9) { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + u8* data; + u32 mask; + if (expectedTarget == memregion_MainRAM) + { + data = NDS::MainRAM; + mask = MAIN_RAM_SIZE - 1; + } + else if (expectedTarget == memregion_BIOS9) + { + data = NDS::ARM9BIOS; + mask = 0xFFF; + } + else + { + data = NDS::ARM7WRAM; + mask = 0xFFFF; + } + OpArg memLoc; + if (addrIsStatic) + { + memLoc = M(data + ((staticAddress & mask & addressMask))); + } + else + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm32(mask & addressMask)); + memLoc = MDisp(RSCRATCH, squeezePointer(data)); + } + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - else + else if (expectedTarget == memregion_DTCM) + { + if (addrIsStatic) + MOV(32, R(RSCRATCH), Imm32(staticAddress)); + else + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + } + else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, 
op2.Reg.Amount, rm, false, throwAway); - - if (flags & memop_SubtractOffset) + MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + if (addrIsStatic) { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); + MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); } else - MOV_sum(32, finalAddr, rnMapped, offset); + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm8(addressMask)); + } + AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - } + else + { + u32 maskedDataRegion; - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); + if (addrIsStatic) + { + maskedDataRegion = staticAddress; + MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + AND(32, R(ABI_PARAM1), Imm8(addressMask)); - if (flags & memop_Store) - MOV(32, R(ABI_PARAM2), rdMapped); + maskedDataRegion = CurInstr.DataRegion; + if (Num == 0) + maskedDataRegion &= ~0xFFFFFF; + else + maskedDataRegion &= ~0x7FFFFF; + } - if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) - MOV(32, rdMapped, R(ABI_PARAM1)); + void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); - if (inlinePreparation && size > 8) - AND(32, R(ABI_PARAM1), Imm8(addressMask)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); - CALL(memoryFunc); + ABI_CallFunction((void(*)())func); + } + else + { + if (!addrIsStatic) + MOV(32, rdMapped, R(RSCRATCH3)); - /*if (Num == 0 && check) - { - CMP(32, R(EAX), 
rdMapped); - FixupBranch notEqual = J_CC(CC_E); - ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); - MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8))); - MOV(32, R(ABI_PARAM2), R(EAX)); - MOV(32, R(ABI_PARAM3), rdMapped); - MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); - CALL((u8*)fault); - ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); - SetJumpTarget(notEqual); - }*/ - - if (!(flags & memop_Store)) - { - if (inlinePreparation && size == 32) + ABI_CallFunction((void(*)())func); + + if (!addrIsStatic) + MOV(32, R(RSCRATCH3), rdMapped); + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if ((size == 32 && !(flags & memop_Store))) { - if (constLocalROR32 == 4) + if (addrIsStatic) + { + if (staticAddress & 0x3) + ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); + } + else { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(ECX), rdMapped); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else if (constLocalROR32 != 0) - ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); } - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + if (compileSlowPath) + { + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + } + + if (compileSlowPath) + { + if (Num == 0) + { + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + } + } + } else - MOVZX(32, size, rdMapped.GetSimpleReg(), 
R(RSCRATCH)); + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + } + } + } + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (compileFastPath && compileSlowPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); } if (!(flags & memop_Store) && rd == 15) @@ -498,100 +433,160 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - IrregularCycles = true; - int regsCount = regs.Count(); s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; +#else u32 stackAlloc = ((regsCount + 1) & ~1) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; - if (!store) + int expectedTarget = Num == 0 + ? 
ClassifyAddress9(CurInstr.DataRegion) + : ClassifyAddress7(CurInstr.DataRegion); + if (usermode || CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false; + + switch (expectedTarget) { + case memregion_DTCM: + case memregion_MainRAM: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_WRAM7: + compileFastPath = true; + break; + default: + break; + } + + if (!store) Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); - if (decrement) + if (decrement) + { + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(RSCRATCH4), MapReg(rn)); + + if (compileFastPath) + { + assert(!usermode); + + MOV(32, R(RSCRATCH), R(RSCRATCH4)); + SHR(32, R(RSCRATCH), Imm8(9)); + + if (store) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); } else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); - MOV(64, R(ABI_PARAM2), R(RSP)); - - CALL(Num == 0 - ? MemoryFuncsSeq9[0][preinc] - : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + FixupBranch slowPath = J_CC(CC_NE, true); - bool firstUserMode = true; - for (int reg = 15; reg >= 0; reg--) + if (expectedTarget == memregion_DTCM) { - if (regs[reg]) + SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); + LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); + } + else if (expectedTarget == memregion_MainRAM) + { + AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); + } + else if (expectedTarget == memregion_WRAM7) + { + AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); + } + else // SWRAM + { + AND(32, R(RSCRATCH4), Imm8(~3)); + AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + } + u32 offset = 0; + for (int reg : regs) + { + if (preinc) + offset += 4; + OpArg mem = MDisp(RSCRATCH4, offset); + if (store) { - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (RegCache.LoadedRegs & (1 << reg)) { - if (firstUserMode) - { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - firstUserMode = false; - } - MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - POP(RSCRATCH3); - CALL(WriteBanked); - FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); - else - SaveReg(reg, RSCRATCH3); - SetJumpTarget(sucessfulWritten); + MOV(32, mem, MapReg(reg)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else { - assert(reg != 15); - - POP(RSCRATCH); - SaveReg(reg, RSCRATCH); + LoadReg(reg, RSCRATCH); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); } else { - if (reg != 15) - RegCache.DirtyRegs |= (1 << reg); - POP(MapReg(reg).GetSimpleReg()); + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); } } + if (!preinc) + offset += 4; } - if (regsCount & 1) - POP(RSCRATCH); + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + + if (!store) + { + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + if (allocOffset == 0) + MOV(64, R(ABI_PARAM2), R(RSP)); + else + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - if (regs[15]) + switch (Num * 2 | preinc) { - if (Num == 1) - { - if (Thumb) - OR(32, MapReg(15), Imm8(1)); - else - AND(32, MapReg(15), Imm8(0xFE)); - } - Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } - } - else - { - Comp_AddCycles_CD(); - if (regsCount & 1) - PUSH(RSCRATCH); + if (allocOffset) + ADD(64, R(RSP), Imm8(allocOffset)); bool firstUserMode = true; for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -599,43 +594,107 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, RSCRATCH3); - else - MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - CALL(ReadBanked); - PUSH(RSCRATCH3); + POP(RSCRATCH3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); + else + SaveReg(reg, RSCRATCH3); + SetJumpTarget(sucessfulWritten); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else if (!(RegCache.LoadedRegs & (1 << reg))) { - LoadReg(reg, RSCRATCH); - PUSH(RSCRATCH); + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); } else { - PUSH(MapReg(reg).GetSimpleReg()); + POP(MapReg(reg).GetSimpleReg()); } } - - if (decrement) + } + else + { + bool firstUserMode = true; + for (int reg = 15; reg >= 0; reg--) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - 
preinc ^= true; + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, RSCRATCH3); + else + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(RSCRATCH3); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + { + PUSH(MapReg(reg).GetSimpleReg()); + } + } } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - MOV(64, R(ABI_PARAM2), R(RSP)); + if (allocOffset) + SUB(64, R(RSP), Imm8(allocOffset)); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + if (allocOffset) + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + else + MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - CALL(Num == 0 - ? MemoryFuncsSeq9[1][preinc] - : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + switch (Num * 2 | preinc) + { + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; + } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); } + if (compileFastPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + return offset; } @@ -786,9 +845,7 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); - else + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 28362d9..b50e821 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -373,16 +373,16 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); - res.NotStrictlyNeeded |= set; + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.DstRegs |= set; } if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + u32 set = (instr & 0xFF); if (res.Kind == tk_PUSH && instr & (1 << 8)) set |= (1 << 14); - res.NotStrictlyNeeded |= set; + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.SrcRegs |= set; } @@ -495,15 +495,15 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.DstRegs |= set; - res.NotStrictlyNeeded |= set; } if (res.Kind == ak_STM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= 
set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.SrcRegs |= set; - res.NotStrictlyNeeded |= set; } if ((instr >> 28) < 0xE) diff --git a/src/CP15.cpp b/src/CP15.cpp index 62258e9..e665dbd 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -97,6 +97,10 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { +#ifdef JIT_ENABLED + u32 oldDTCMBase = DTCMBase; + u32 oldDTCMSize = DTCMSize; +#endif if (CP15Control & (1<<16)) { DTCMBase = DTCMSetting & 0xFFFFF000; @@ -109,10 +113,20 @@ void ARMv5::UpdateDTCMSetting() DTCMSize = 0; //printf("DTCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + { + ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); + ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + } +#endif } void ARMv5::UpdateITCMSetting() { +#ifdef JIT_ENABLED + u32 oldITCMSize = ITCMSize; +#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -123,6 +137,10 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldITCMSize != ITCMSize) + ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); +#endif } @@ -561,15 +579,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: -#ifdef JIT_ENABLED - ARMJIT::InvalidateAll(); -#endif ICacheInvalidateAll(); return; case 0x751: -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); -#endif ICacheInvalidateByAddr(val); return; case 0x752: @@ -732,7 +744,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { @@ -753,7 +765,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -776,7 +788,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { - DataRegion = addr >> 12; + 
DataRegion = addr; addr &= ~3; @@ -820,14 +832,14 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -844,7 +856,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -853,7 +865,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -870,7 +882,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -879,7 +891,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -903,7 +915,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 141c565..6e989a8 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -535,10 +535,6 @@ void Reset() KeyCnt = 0; RCnt = 0; -#ifdef JIT_ENABLED - ARMJIT::ResetBlockCache(); -#endif - NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); @@ -548,6 +544,10 @@ void Reset() Wifi::Reset(); AREngine::Reset(); + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif } void Stop() @@ -1058,6 +1058,9 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + WRAMCnt = val; switch (WRAMCnt & 0x3) @@ -1090,6 +1093,11 @@ void MapSharedWRAM(u8 
val) SWRAM_ARM7Mask = 0x7FFF; break; } + +#ifdef JIT_ENABLED + ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); + ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); +#endif } @@ -1873,12 +1881,18 @@ void ARM9Write8(u32 addr, u8 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1923,12 +1937,18 @@ void ARM9Write16(u32 addr, u16 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1949,7 +1969,12 @@ void ARM9Write16(u32 addr, u16 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -1989,12 +2014,18 @@ void ARM9Write32(u32 addr, u32 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return ; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2015,7 +2046,12 @@ void ARM9Write32(u32 addr, u32 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, 
val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -2279,30 +2315,38 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2312,6 +2356,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2342,30 +2389,38 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif 
*(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2383,6 +2438,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2415,30 +2473,38 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2457,6 +2523,9 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; diff --git a/src/NDS.h b/src/NDS.h index c7b455e..163260b 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -120,6 +120,14 @@ extern u8 ROMSeed1[2*8]; extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; +extern u8 SharedWRAM[0x8000]; +extern u8* SWRAM_ARM9; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM9Mask; +extern u32 SWRAM_ARM7Mask; + +extern u8 ARM7WRAM[0x10000]; + #define MAIN_RAM_SIZE 0x400000 extern u8 MainRAM[MAIN_RAM_SIZE]; -- cgit v1.2.3 From 5a0b568647ae3a0d501ca1b915745fe708c9519f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 14:34:52 +0200 Subject: allow allocating caller saved registers currently system-v only --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 19 ++---------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 58 
++++++++++++++++++++++++++----------- src/ARMJIT_x64/ARMJIT_Compiler.h | 3 ++ src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 19 ++++++++++++ 4 files changed, 65 insertions(+), 34 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cac590a..27c24c7 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -138,18 +138,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) bool cpsrDirty = CPSRDirty; SaveCPSR(); - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... - // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + PushRegs(restoreCPSR); MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), R(addr)); @@ -162,11 +151,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else CALL((void*)&ARMv4::JumpTo); - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } + PopRegs(restoreCPSR); LoadCPSR(); // in case this instruction is skipped diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index eee2e0f..ef04601 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,7 +26,8 @@ const X64Reg RegisterCache::NativeRegAllocOrder[] = #ifdef _WIN32 RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13, R14 // this is sad + RBX, R12, R13, R14, // callee saved, this is sad + R9, R10, R11, // caller saved #endif }; template <> @@ -34,10 +35,46 @@ const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 6 #else - 4 + 7 #endif ; +void Compiler::PushRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + + if (saveHiRegs) + { + BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + { + if (Thumb || CurInstr.Cond() == 0xE) + 
RegCache.UnloadRegister(reg); + else + SaveReg(reg, RegCache.Mapping[reg]); + // prevent saving the register twice + loadedRegs[reg] = false; + } + } + + for (int reg : loadedRegs) + if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + SaveReg(reg, RegCache.Mapping[reg]); +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + BitSet32 loadedRegs(RegCache.LoadedRegs); + for (int reg : loadedRegs) + { + if ((saveHiRegs && reg >= 8 && reg < 15) + || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED) + { + LoadReg(reg, RegCache.Mapping[reg]); + } + } +} + void Compiler::A_Comp_MRS() { Comp_AddCycles_C(); @@ -136,27 +173,14 @@ void Compiler::A_Comp_MSR() AND(32, R(RSCRATCH2), val); OR(32, R(RCPSR), R(RSCRATCH2)); - BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... - // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } + PushRegs(true); MOV(32, R(ABI_PARAM3), R(RCPSR)); MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); MOV(64, R(ABI_PARAM1), R(RCPU)); CALL((void*)&ARM::UpdateMode); - if (!Thumb && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } + PopRegs(true); } } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9df218b..f2fc301 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -168,6 +168,9 @@ public: Gen::FixupBranch CheckCondition(u32 cond); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Gen::OpArg MapReg(int reg) { if (reg == 15 && RegCache.Mapping[reg] == Gen::INVALID_REG) diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index c13b779..b27efdd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -283,6 +283,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const 
ComplexOperand& op2, int siz } else { + PushRegs(false); + u32 maskedDataRegion; if (addrIsStatic) @@ -310,6 +312,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, R(ABI_PARAM2), rdMapped); ABI_CallFunction((void(*)())func); + + PopRegs(false); } else { @@ -318,6 +322,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ABI_CallFunction((void(*)())func); + PopRegs(false); + if (!addrIsStatic) MOV(32, R(RSCRATCH3), rdMapped); @@ -352,6 +358,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (compileSlowPath) { + PushRegs(false); + if (Num == 0) { MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); @@ -402,6 +410,9 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } } } + + PopRegs(false); + if (!(flags & memop_Store)) { if (flags & memop_SignExtend) @@ -561,6 +572,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (!store) { + PushRegs(false); + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); @@ -580,6 +593,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc case 3: CALL((void*)&SlowBlockTransfer7); break; } + PopRegs(false); + if (allocOffset) ADD(64, R(RSP), Imm8(allocOffset)); @@ -655,6 +670,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (allocOffset) SUB(64, R(RSP), Imm8(allocOffset)); + PushRegs(false); + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); if (allocOffset) LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); @@ -674,6 +691,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? 
Imm8(stackAlloc) : Imm32(stackAlloc)); + + PopRegs(false); } if (compileFastPath) -- cgit v1.2.3 From 4cff4b52286a7d1a7e40817d52a5d271a937ddc2 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 15:39:39 +0200 Subject: allow allocating caller saved regs on windows --- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ef04601..fd3fb70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -24,7 +24,8 @@ template <> const X64Reg RegisterCache::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13, R14 + RBX, RSI, RDI, R12, R13, R14, // callee saved + R10, R11, // caller saved #else RBX, R12, R13, R14, // callee saved, this is sad R9, R10, R11, // caller saved @@ -33,7 +34,7 @@ const X64Reg RegisterCache::NativeRegAllocOrder[] = template <> const int RegisterCache::NativeRegsAvailable = #ifdef _WIN32 - 6 + 8 #else 7 #endif -- cgit v1.2.3 From fea9f95bba7475b2cd3b624a3ccd6cdee00a33f1 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 12 May 2020 16:09:20 +0200 Subject: fix inlined IO register access --- src/ARMJIT_x64/ARMJIT_Branch.cpp | 1 - src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 27c24c7..bda9e52 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -134,7 +134,6 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; - BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); bool cpsrDirty = CPSRDirty; SaveCPSR(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b27efdd..cf0bd23 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ 
-283,8 +283,6 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz } else { - PushRegs(false); - u32 maskedDataRegion; if (addrIsStatic) @@ -309,6 +307,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (flags & memop_Store) { + PushRegs(false); + MOV(32, R(ABI_PARAM2), rdMapped); ABI_CallFunction((void(*)())func); @@ -320,6 +320,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (!addrIsStatic) MOV(32, rdMapped, R(RSCRATCH3)); + PushRegs(false); + ABI_CallFunction((void(*)())func); PopRegs(false); -- cgit v1.2.3 From e335a8ca7615c702cfa2dcdb71deb69468088fd8 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jun 2020 21:04:25 +0200 Subject: first steps in bringing over the JIT refactor/fastmem --- src/ARM.cpp | 43 +- src/ARM.h | 15 +- src/ARMJIT.cpp | 771 ++++++++++----------------------- src/ARMJIT.h | 64 +-- src/ARMJIT_A64/ARMJIT_ALU.cpp | 123 +++++- src/ARMJIT_A64/ARMJIT_Branch.cpp | 99 ++--- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 383 ++++++++++++----- src/ARMJIT_A64/ARMJIT_Compiler.h | 71 +++- src/ARMJIT_A64/ARMJIT_Linkage.s | 68 +++ src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 790 ++++++++++++++++------------------ src/ARMJIT_Compiler.h | 12 + src/ARMJIT_Internal.h | 70 +-- src/ARMJIT_Memory.cpp | 822 ++++++++++++++++++++++++++++++++++++ src/ARMJIT_Memory.h | 53 +++ src/ARMJIT_x64/ARMJIT_Compiler.cpp | 92 +--- src/ARMJIT_x64/ARMJIT_Compiler.h | 11 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 45 +- src/ARM_InstrInfo.cpp | 73 ++-- src/ARM_InstrInfo.h | 1 + src/CMakeLists.txt | 6 +- src/CP15.cpp | 84 ++-- src/Config.cpp | 6 +- src/Config.h | 1 + src/NDS.cpp | 220 +++++----- src/NDS.h | 17 +- 25 files changed, 2342 insertions(+), 1598 deletions(-) create mode 100644 src/ARMJIT_A64/ARMJIT_Linkage.s create mode 100644 src/ARMJIT_Compiler.h create mode 100644 src/ARMJIT_Memory.cpp create mode 100644 src/ARMJIT_Memory.h (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp 
b/src/ARM.cpp index 92a3a9e..e529be8 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,6 +21,8 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "ARMJIT.h" +#include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" @@ -74,7 +76,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -82,6 +86,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -622,24 +633,26 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); - if (!translatedAddr) + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - // hack so Cycles <= 0 becomes Cycles < 0 - Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; if (StopExecution) { @@ -766,23 +779,25 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); - if (!translatedAddr) + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < 
FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; // TODO optimize this shit!!! if (StopExecution) diff --git a/src/ARM.h b/src/ARM.h index b1e8053..b7f16d6 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -32,11 +32,14 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); @@ -143,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -158,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -260,8 +269,8 @@ public: u32 DTCMBase, DTCMSize; s32 RegionCodeCycles; - u8 ITCM[0x8000]; - u8 DTCM[0x4000]; + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8d87c76..53b28c1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -10,13 +10,8 @@ #include "Config.h" #include "ARMJIT_Internal.h" -#if defined(__x86_64__) -#include "ARMJIT_x64/ARMJIT_Compiler.h" -#elif defined(__aarch64__) -#include "ARMJIT_A64/ARMJIT_Compiler.h" -#else -#error "The current target platform doesn't have a 
JIT backend" -#endif +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -29,6 +24,11 @@ #include "Wifi.h" #include "NDSCart.h" +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + namespace ARMJIT { @@ -37,281 +37,100 @@ namespace ARMJIT Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = -{ - 0x8000, // Unmapped Region (dummy) - 0x8000, // ITCM - 4*1024*1024, // Main RAM - 0x8000, // SWRAM - 0xA4000, // LCDC - 0x8000, // ARM9 BIOS - 0x4000, // ARM7 BIOS - 0x10000, // ARM7 WRAM - 0x40000 // ARM7 WVRAM -}; - -const u32 ExeMemRegionOffsets[] = -{ - 0, - 0x8000, - 0x10000, - 0x410000, - 0x418000, - 0x4BC000, - 0x4C4000, - 0x4C8000, - 0x4D8000, - 0x518000, -}; - -/* - translates address to pseudo physical address - - more compact, eliminates mirroring, everything comes in a row - - we only need one translation table -*/ - -u32 TranslateAddr9(u32 addr) -{ - switch (ClassifyAddress9(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM9: - if (NDS::SWRAM_ARM9) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); - else - return 0; - case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); - case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? 
ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; - case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); - default: return 0; - } -} - -u32 TranslateAddr7(u32 addr) -{ - switch (ClassifyAddress7(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM7: - if (NDS::SWRAM_ARM7) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); - else - return 0; - case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; - case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); - case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); - default: return 0; - } -} - -AddressRange CodeRanges[ExeMemSpaceSize / 512]; - -TinyVector InvalidLiterals; +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; -u8 MemoryStatus9[0x800000]; -u8 MemoryStatus7[0x800000]; +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; -int ClassifyAddress9(u32 addr) +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { - if (addr < NDS::ARM9->ITCMSize) - return 
memregion_ITCM; - else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else - { - switch (addr & 0xFF000000) - { - case 0x02000000: - return memregion_MainRAM; - case 0x03000000: - return memregion_SWRAM9; - case 0x04000000: - return memregion_IO9; - case 0x06000000: - return memregion_VRAM; - } - } - return memregion_Other; -} + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, +}; -int ClassifyAddress7(u32 addr) +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = { - if (addr < 0x00004000) - return memregion_BIOS7; - else - { - switch (addr & 0xFF800000) - { - case 0x02000000: - case 0x02800000: - return memregion_MainRAM; - case 0x03000000: - if (NDS::SWRAM_ARM7) - return memregion_SWRAM7; - else - return memregion_WRAM7; - case 0x03800000: - return memregion_WRAM7; - case 0x04000000: - return memregion_IO7; - case 0x04800000: - return memregion_Wifi; - case 0x06000000: - case 0x06800000: - return memregion_VWRAM; - } - } - return memregion_Other; -} + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, +}; -void UpdateMemoryStatus9(u32 start, u32 end) +u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) - { - u32 addr = i << 12; - - int region = ClassifyAddress9(addr); - u32 pseudoPhyisical = TranslateAddr9(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus9[i * 8 + j] = val; - } - } -} + NULL, + 
FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM +}; -void UpdateMemoryStatus7(u32 start, u32 end) +u32 LocaliseCodeAddress(u32 num, u32 addr) { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize) + && CodeMemRegions[region]) { - u32 addr = i << 12; - - int region = ClassifyAddress7(addr); - u32 pseudoPhyisical = TranslateAddr7(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus7[i * 8 + j] = val; - } + addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + addr |= (u32)region << 28; + return addr; } + return 0; } -void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) -{ - for (u32 i = 1; i < exeMem_Count; i++) - { - if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) - { - for (u32 num = 0; num < 2; num++) - { - u32 physSize = ExeMemRegionSizes[i]; - u32 mapSize = 0; - u32 mapStart = 0; - switch (i) - { - case exeMem_ITCM: - if (num == 0) - mapStart = 0; mapSize = NDS::ARM9->ITCMSize; - break; - case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; - case exeMem_SWRAM: - if (num == 0) - { - if (NDS::SWRAM_ARM9) - mapStart = 0x3000000, mapSize = 0x1000000; - else - mapStart = mapSize = 0; - } - else - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3000000, mapSize = 0x800000; - else - mapStart = mapSize = 0; - } - break; - case exeMem_LCDC: - if (num == 0) - mapStart = 0x6800000, mapSize = 
0xA4000; - break; - case exeMem_ARM9_BIOS: - if (num == 0) - mapStart = 0xFFFF0000, mapSize = 0x10000; - break; - case exeMem_ARM7_BIOS: - if (num == 1) - mapStart = 0; mapSize = 0x4000; - break; - case exeMem_ARM7_WRAM: - if (num == 1) - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3800000, mapSize = 0x800000; - else - mapStart = 0x3000000, mapSize = 0x1000000; - } - break; - case exeMem_ARM7_WVRAM: - if (num == 1) - mapStart = 0x6000000, mapSize = 0x1000000; - break; - } - - for (u32 j = 0; j < mapSize / physSize; j++) - { - u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); - if (num == 0 - && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - continue; - if (invalidate) - { - if (num == 0) - MemoryStatus9[virtAddr / 512] |= 0x80; - else - MemoryStatus7[virtAddr / 512] |= 0x80; - } - else - { - if (num == 0) - MemoryStatus9[virtAddr / 512] &= ~0x80; - else - MemoryStatus7[virtAddr / 512] &= ~0x80; - } - } - - } - return; - } - } - - assert(false); -} +TinyVector InvalidLiterals; template -T SlowRead9(ARMv5* cpu, u32 addr) +T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; addr &= ~(sizeof(T) - 1); @@ -335,13 +154,13 @@ T SlowRead9(ARMv5* cpu, u32 addr) } template -void SlowWrite9(ARMv5* cpu, u32 addr, T val) +void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); if (addr < cpu->ITCMSize) { - InvalidateITCMIfNecessary(addr); + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); *(T*)&cpu->ITCM[addr & 0x7FFF] = val; } else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) @@ -362,13 +181,13 @@ void SlowWrite9(ARMv5* cpu, u32 addr, T val) } } -template void SlowWrite9(ARMv5*, u32, u32); -template void SlowWrite9(ARMv5*, u32, u16); -template void SlowWrite9(ARMv5*, u32, u8); +template void SlowWrite9(u32, ARMv5*, u32); +template void SlowWrite9(u32, ARMv5*, u16); +template void SlowWrite9(u32, ARMv5*, u8); -template u32 SlowRead9(ARMv5*, u32); 
-template u16 SlowRead9(ARMv5*, u32); -template u8 SlowRead9(ARMv5*, u32); +template u32 SlowRead9(u32, ARMv5*); +template u16 SlowRead9(u32, ARMv5*); +template u8 SlowRead9(u32, ARMv5*); template T SlowRead7(u32 addr) @@ -407,14 +226,15 @@ template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) - SlowWrite9(cpu, addr, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(cpu, addr); - addr += !PreInc * 4; + data[i] = SlowRead9(addr, cpu); + addr += 4; } } @@ -422,14 +242,15 @@ template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) SlowWrite7(addr, data[i]); else data[i] = SlowRead7(addr); - addr += !PreInc * 4; + addr += 4; } } @@ -540,16 +361,18 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp9; -UnreliableHashTable FastBlockLookUp7; void Init() { JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); } void DeInit() { + ARMJIT_Memory::DeInit(); + delete JITCompiler; } @@ -557,8 +380,7 @@ void Reset() { ResetBlockCache(); - UpdateMemoryStatus9(0, 0xFFFFFFFF); - UpdateMemoryStatus7(0, 0xFFFFFFFF); + ARMJIT_Memory::Reset(); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -673,11 +495,12 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) // it basically checks if one iteration of a loop depends on another // the rules are quite simple + JIT_DEBUGPRINT("checking potential idle loop\n"); u16 regsWrittenTo = 0; u16 regsDisallowedToWrite = 0; for (int i = 0; i < instrsCount; i++) { - //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, 
instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) return false; if (i < instrsCount - 1 && instrs[i].Info.Branches()) @@ -782,8 +605,6 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F - -extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -794,14 +615,28 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr9(blockAddr) - : TranslateAddr7(blockAddr); - if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) - { - printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); - } - + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 localAddr = existingBlockIt->second->StartAddrLocal; + + u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; @@ -842,9 +677,8 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; - u32 translatedAddr = cpu->Num == 0 - ? 
TranslateAddr9(instrs[i].Addr) - : TranslateAddr7(instrs[i].Addr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -928,9 +762,11 @@ void CompileBlock(ARM* cpu) && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral && DecodeLiteral(thumb, instrs[i], literalAddr)) { - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(literalAddr) - : TranslateAddr7(literalAddr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } u32 translatedAddrRounded = translatedAddr & ~0x1FF; u32 j = 0; @@ -994,9 +830,7 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr9(target) - : TranslateAddr7(target); + u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); if (link) { @@ -1048,7 +882,7 @@ void CompileBlock(ARM* cpu) { RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1087,11 +921,12 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numLiterals; j++) block->Literals()[j] = literalLoadAddrs[j]; - block->PseudoPhysicalAddr = pseudoPhysicalAddr; + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); } else { @@ -1104,30 +939,34 @@ void CompileBlock(ARM* cpu) assert(addressRanges[j] == block->AddressRanges()[j]); 
assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; - CodeRanges[addressRanges[j] / 512].Blocks.Add(block); - UpdateRegionByPseudoPhyiscal(addressRanges[j], true); + AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); } if (cpu->Num == 0) - { - JitBlocks9[pseudoPhysicalAddr] = block; - FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks9[blockAddr] = block; else - { - JitBlocks7[pseudoPhysicalAddr] = block; - FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 localAddr) { - JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + AddressRange* region = CodeMemRegions[localAddr >> 28]; + AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; for (int i = 0; i < range->Blocks.Length;) @@ -1138,7 +977,7 @@ void InvalidateByAddr(u32 pseudoPhysical) u32 mask = 0; for (int j = 0; j < block->NumAddresses; j++) { - if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) { mask = 
block->AddressMasks()[j]; invalidated = block->AddressMasks()[j] & mask; @@ -1154,15 +993,21 @@ void InvalidateByAddr(u32 pseudoPhysical) } range->Blocks.Remove(i); + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + } + bool literalInvalidation = false; for (int j = 0; j < block->NumLiterals; j++) { u32 addr = block->Literals()[j]; - if (addr == pseudoPhysical) + if (addr == localAddr) { - if (InvalidLiterals.Find(pseudoPhysical) != -1) + if (InvalidLiterals.Find(localAddr) != -1) { - InvalidLiterals.Add(pseudoPhysical); + InvalidLiterals.Add(localAddr); JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); } literalInvalidation = true; @@ -1172,35 +1017,30 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 512) != (pseudoPhysical / 512)) + if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 28]; + AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; assert(otherRange != range); + bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); if (otherRange->Blocks.Length == 0) { + if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + otherRange->Code = 0; - UpdateRegionByPseudoPhyiscal(addr, false); } } } - for (int j = 0; j < block->NumLinks(); j++) - JITCompiler->UnlinkBlock(block->Links()[j]); - block->ResetLinks(); - + FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) - { - JitBlocks9.erase(block->PseudoPhysicalAddr); - FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); - } + JitBlocks9.erase(block->StartAddr); else - { - 
JitBlocks7.erase(block->PseudoPhysicalAddr); - FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); - } + JitBlocks7.erase(block->StartAddr); if (!literalInvalidation) { @@ -1213,24 +1053,66 @@ void InvalidateByAddr(u32 pseudoPhysical) delete block; } } +} - if (range->Blocks.Length == 0) - UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); +template +void CheckAndInvalidate(u32 addr) +{ + // let's hope this gets all properly inlined + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + { + u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr | (region << 28)); + } +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; } -void InvalidateRegionIfNecessary(u32 pseudoPhyisical) +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { - if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) - InvalidateByAddr(pseudoPhyisical); + int region = num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (CodeMemRegions[region] + && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize)) + { + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + // evil, though it should work for everything except DTCM which is not relevant here + start = blockAddr & ~(memorySize - 1); + size = memorySize; + return true; + } + else + return false; } +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); + void ResetBlockCache() { printf("Resetting JIT block cache...\n"); InvalidLiterals.Clear(); - FastBlockLookUp9.Reset(); - FastBlockLookUp7.Reset(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -1251,8 +1133,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } delete block; } @@ -1262,8 +1145,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) 
{ u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } } JitBlocks9.clear(); @@ -1272,191 +1156,4 @@ void ResetBlockCache() JITCompiler->Reset(); } -template -JitBlockEntry LookUpBlockEntry(u32 addr) -{ - auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; - u32 entryOffset = fastMap.LookUp(addr); - if (entryOffset != UINT32_MAX) - return JITCompiler->AddEntryOffset(entryOffset); - - auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; - auto block = slowMap.find(addr); - if (block != slowMap.end()) - { - fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); - return block->second->EntryPoint; - } - return NULL; -} - -template JitBlockEntry LookUpBlockEntry<0>(u32); -template JitBlockEntry LookUpBlockEntry<1>(u32); - -template -void LinkBlock(ARM* cpu, u32 codeOffset) -{ - auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; - u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); - u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); - auto block = blockMap.find(targetPseudoPhys); - if (block == blockMap.end()) - { - CompileBlock(cpu); - block = blockMap.find(targetPseudoPhys); - } - - JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); - - block->second->AddLink(codeOffset); - JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); -} - -template void LinkBlock<0>(ARM*, u32); -template void LinkBlock<1>(ARM*, u32); - -void WifiWrite32(u32 addr, u32 val) -{ - Wifi::Write(addr, val & 0xFFFF); - Wifi::Write(addr + 2, val >> 16); -} - -u32 WifiRead32(u32 addr) -{ - return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); -} - -template -void VRAMWrite(u32 addr, T val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} -template -T VRAMRead(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} - -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) -{ - if (cpu->Num == 0) - { - switch (addr & 0xFF000000) - { - case 0x04000000: - if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) - return (void*)NDSCart::ReadROMData; - - /* - unfortunately we can't map GPU2D this way - since it's hidden inside an object - - though GPU3D registers are accessed much more intensive - */ - if (addr >= 0x04000320 && addr < 0x040006A4) - { - switch (size | store) - { - case 8: return (void*)GPU3D::Read8; - case 9: return (void*)GPU3D::Write8; - case 16: return (void*)GPU3D::Read16; - case 17: return 
(void*)GPU3D::Write16; - case 32: return (void*)GPU3D::Read32; - case 33: return (void*)GPU3D::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; - } - break; - case 0x06000000: - switch (size | store) - { - case 8: return (void*)VRAMRead; - case 9: return NULL; - case 16: return (void*)VRAMRead; - case 17: return (void*)VRAMWrite; - case 32: return (void*)VRAMRead; - case 33: return (void*)VRAMWrite; - } - break; - } - } - else - { - switch (addr & 0xFF800000) - { - case 0x04000000: - if (addr >= 0x04000400 && addr < 0x04000520) - { - switch (size | store) - { - case 8: return (void*)SPU::Read8; - case 9: return (void*)SPU::Write8; - case 16: return (void*)SPU::Read16; - case 17: return (void*)SPU::Write16; - case 32: return (void*)SPU::Read32; - case 33: return (void*)SPU::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; - } - break; - case 0x04800000: - if (addr < 0x04810000 && size >= 16) - { - switch (size | store) - { - case 16: return (void*)Wifi::Read; - case 17: return (void*)Wifi::Write; - case 32: return (void*)WifiRead32; - case 33: return (void*)WifiWrite32; - } - } - break; - case 0x06000000: - case 0x06800000: - switch (size | store) - { - case 8: return (void*)GPU::ReadVRAM_ARM7; - case 9: return (void*)GPU::WriteVRAM_ARM7; - case 16: return (void*)GPU::ReadVRAM_ARM7; - case 17: return (void*)GPU::WriteVRAM_ARM7; - case 32: return (void*)GPU::ReadVRAM_ARM7; - case 33: return (void*)GPU::WriteVRAM_ARM7; - } - } - } - return NULL; -} - } 
diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 44a6140..2320b7b 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,32 +9,7 @@ namespace ARMJIT { -enum ExeMemKind -{ - exeMem_Unmapped = 0, - exeMem_ITCM, - exeMem_MainRAM, - exeMem_SWRAM, - exeMem_LCDC, - exeMem_ARM9_BIOS, - exeMem_ARM7_BIOS, - exeMem_ARM7_WRAM, - exeMem_ARM7_WVRAM, - exeMem_Count -}; - -extern const u32 ExeMemRegionOffsets[]; -extern const u32 ExeMemRegionSizes[]; - -typedef u32 (*JitBlockEntry)(); - -const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... - -u32 TranslateAddr9(u32 addr); -u32 TranslateAddr7(u32 addr); - -template -JitBlockEntry LookUpBlockEntry(u32 addr); +typedef void (*JitBlockEntry)(); void Init(); void DeInit(); @@ -43,44 +18,15 @@ void Reset(); void InvalidateByAddr(u32 pseudoPhysical); -void InvalidateRegionIfNecessary(u32 addr); - -inline void InvalidateMainRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); -} -inline void InvalidateITCMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); -} -inline void InvalidateLCDCIfNecessary(u32 addr) -{ - if (addr < 0x68A3FFF) - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); -} -inline void InvalidateSWRAM7IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); -} -inline void InvalidateSWRAM9IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); -} -inline void InvalidateARM7WRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); -} -inline void InvalidateARM7WVRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 
0x1FFFF)); -} +template +void CheckAndInvalidate(u32 addr); void CompileBlock(ARM* cpu); void ResetBlockCache(); -void UpdateMemoryStatus9(u32 start, u32 end); -void UpdateMemoryStatus7(u32 start, u32 end); +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size); } diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp index 0fe6a97..5f021a0 100644 --- a/src/ARMJIT_A64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -243,7 +243,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S && !CurInstr.SetFlags) S = false; - bool CVInGP = false; + bool CVInGPR = false; switch (op) { case 0x2: // SUB @@ -306,7 +306,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 UBFX(W2, RCPSR, 29, 1); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, rn, W2); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -335,7 +335,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -355,7 +355,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 MVN(W1, rn); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -379,12 +379,12 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S) { - if (CVInGP) + if (CVInGPR) { BFI(RCPSR, W2, 29, 1); BFI(RCPSR, W3, 28, 1); } - Comp_RetriveFlags(!CVInGP); + Comp_RetriveFlags(!CVInGPR); } } @@ -501,7 +501,23 @@ void Compiler::A_Comp_ALUMovOp() MOVI2R(rd, op2.Imm); } else - MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, 
op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } } if (S) @@ -558,10 +574,7 @@ void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + CLS(W0, rs); Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -594,10 +607,10 @@ void Compiler::A_Comp_Mul_Long() } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -628,6 +641,86 @@ void Compiler::A_Comp_Mul_Long() Comp_RetriveFlags(false); } +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? 
EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + void Compiler::A_Comp_Mul() { ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index 542f0b7..f130938 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -143,7 +143,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } @@ -152,23 +152,19 @@ void* Compiler::Gen_JumpTo9(int kind) AlignCode16(); void* res = GetRXPtr(); - MOVI2R(W2, kCodeCacheTiming); - // W1 - code cycles non branch - // W2 - branch code cycles LSR(W1, W0, 12); - LSL(W1, W1, 2); ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); LDRB(W1, RCPU, W1); - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, ITCMSize)); STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); - CMP(W0, W3); - FixupBranch outsideITCM = B(CC_LO); - MOVI2R(W1, 1); - MOVI2R(W2, 1); - SetJumpTarget(outsideITCM); + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); FixupBranch switchToThumb; if (kind == 0) @@ -176,40 +172,36 @@ void* Compiler::Gen_JumpTo9(int kind) if (kind == 0 || kind == 1) { - ANDI2R(W0, W0, ~3); - + // ARM if (kind == 0) ANDI2R(RCPSR, RCPSR, ~0x20); - ADD(W3, W0, 4); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); RET(); } + if (kind == 0 || kind == 2) { + // 
Thumb if (kind == 0) { SetJumpTarget(switchToThumb); - ORRI2R(RCPSR, RCPSR, 0x20); } ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); - ADD(W3, W0, 2); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - FixupBranch halfwordLoc = TBZ(W0, 1); - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); - RET(); - - SetJumpTarget(halfwordLoc); - ADD(RCycles, RCycles, W2); + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); RET(); } @@ -237,7 +229,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 0, 8); UBFX(W3, W3, 8, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~3); @@ -261,7 +253,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 16, 8); UBFX(W3, W3, 24, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~1); @@ -287,22 +279,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto } else { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); - bool previouslyDirty = CPSRDirty; + + bool cpsrDirty = CPSRDirty; SaveCPSR(); - - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + SaveCycles(); + PushRegs(restoreCPSR); if (switchThumb) MOV(W1, addr); @@ -319,16 +300,12 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto QuickCallFunction(X3, jumpToTrampoline); else QuickCallFunction(X3, jumpToTrampoline); - - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + PopRegs(restoreCPSR); + LoadCycles(); + LoadCPSR(); + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } } @@ -368,21 +345,13 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); FixupBranch skipFailed = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipFailed); } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index a67f357..42435ed 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -1,9 +1,3 @@ -#include "ARMJIT_Compiler.h" - -#include "../ARMInterpreter.h" - -#include "../ARMJIT_Internal.h" - #ifdef __SWITCH__ #include "../switch/compat_switch.h" @@ -13,10 +7,17 @@ extern char __start__; #include #endif +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + #include using namespace Arm64Gen; +extern "C" void ARM_Ret(); namespace ARMJIT { @@ -28,7 +29,10 @@ namespace ARMJIT like x64. At one hand you can translate a lot of instructions directly. 
But at the same time, there are a ton of exceptions, like for example ADD and SUB can't have a RORed second operand on ARMv8. - */ + + While writing a JIT when an instruction is recompiled into multiple ones + not to write back until you've read all the other operands! +*/ template <> const ARM64Reg RegisterCache::NativeRegAllocOrder[] = @@ -46,6 +50,132 @@ void Compiler::MovePC() ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4); } +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); 
+ + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + Compiler::Compiler() { #ifdef __SWITCH__ @@ -80,8 +210,7 @@ Compiler::Compiler() assert(succeded); SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); - JitMemUseableSize = JitMemSize; - Reset(); + JitMemMainSize = JitMemSize; #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); @@ -90,31 +219,8 @@ Compiler::Compiler() SetCodeBase(pageAligned, pageAligned); JitMemUseableSize = alignedSize; - Reset(); #endif - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - { - MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); - } - } - MemFunc7[0][0] = (void*)NDS::ARM7Read8; - MemFunc7[1][0] = (void*)NDS::ARM7Read16; - MemFunc7[2][0] = (void*)NDS::ARM7Read32; - MemFunc7[0][1] = (void*)NDS::ARM7Write8; - MemFunc7[1][1] = (void*)NDS::ARM7Write16; - MemFunc7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 2; j++) - { - MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); - MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); - } - } + SetCodePtr(0); for (int i = 0; i < 3; i++) { @@ -123,26 +229,26 @@ Compiler::Compiler() } /* - W0 - mode + W5 - mode W1 - reg num W3 - in/out value of reg */ { ReadBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 
2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); RET(); @@ -166,19 +272,19 @@ Compiler::Compiler() { WriteBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); MOVI2R(W4, 0); @@ -206,9 +312,71 @@ Compiler::Compiler() RET(); } - //FlushIcache(); + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 33: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 17: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + case 9: QuickCallFunction(X3, SlowWrite7); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend 
= 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 33: QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead9); break; + case 17: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead9); break; + case 9: QuickCallFunction(X3, SlowRead7); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; - JitMemUseableSize -= GetCodeOffset(); SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); } @@ -227,6 +395,16 @@ Compiler::~Compiler() #endif } +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + void Compiler::LoadReg(int reg, ARM64Reg nativeReg) { if (reg == 15) @@ -325,7 +503,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), // Mul - F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL, + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), // ARMv5 exclusives F(Clz), NULL, NULL, NULL, NULL, @@ -356,7 +534,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(MSR), 
F(MSR), F(MRS), NULL, NULL, NULL, &Compiler::Nop }; #undef F @@ -404,29 +582,34 @@ bool Compiler::CanCompile(bool thumb, u16 kind) return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; } -void Compiler::Comp_BranchSpecialBehaviour() +void Compiler::Comp_BranchSpecialBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) + if (taken && CurInstr.BranchFlags & branch_IdleBranch) { MOVI2R(W0, 1); STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); } - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { - SaveCPSR(false); RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); } } JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) { - printf("JIT memory full, resetting...\n"); + printf("JIT far memory full, resetting...\n"); ResetBlockCache(); } @@ -437,21 +620,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] CurCPU = cpu; ConstantCycles = 0; RegCache = RegisterCache(this, instrs, instrsCount, true); - - //printf("compiling block at %x\n", R15 - (Thumb ? 
2 : 4)); - const u32 ALL_CALLEE_SAVED = 0x7FF80000; - - SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); - - //if (Num == 1) - { - ABI_PushRegisters(SavedRegs); - - MOVP2R(RCPU, CurCPU); - MOVI2R(RCycles, 0); - - LoadCPSR(); - } + CPSRDirty = false; for (int i = 0; i < instrsCount; i++) { @@ -486,6 +655,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (comp == NULL) { + SaveCycles(); SaveCPSR(); RegCache.Flush(); } @@ -535,25 +705,18 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] (this->*comp)(); } - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); if (cond < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipNop = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipNop); } @@ -565,76 +728,74 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } if (comp == NULL) + { + LoadCycles(); LoadCPSR(); + } } RegCache.Flush(); - //if (Num == 1) - { - SaveCPSR(); - - ADD(W0, RCycles, ConstantCycles); - - ABI_PopRegisters(SavedRegs); - } - //else - // ADD(RCycles, RCycles, ConstantCycles); - - RET(); + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); FlushIcache(); - //printf("finished\n"); - return res; } void Compiler::Reset() { + LoadStorePatches.clear(); + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; const u32 brk_0 = 0xD4200000; - for (int i = 0; i < JitMemUseableSize / 4; i++) + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) *(((u32*)GetRWPtr()) + i) = brk_0; } -void Compiler::Comp_AddCycles_C(bool nonConst) +void 
Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (!nonConst && !CurInstr.Info.Branches()) + if (forceNonConstant) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 numI) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; - if (Thumb || CurInstr.Cond() >= 0xE) + if (Thumb || CurInstr.Cond() == 0xE) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + c; - ADD(RCycles, RCycles, numI, shift); + SUB(RCycles, RCycles, cycles); if (Thumb || CurInstr.Cond() >= 0xE) - ConstantCycles += c; + ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CDI() @@ -671,7 +832,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } @@ -715,7 +876,7 @@ void Compiler::Comp_AddCycles_CD() } if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 5c9ef41..e4ffc63 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -9,6 +9,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include + namespace ARMJIT { @@ -64,7 +66,14 @@ struct Op2 }; }; -class Compiler : 
Arm64Gen::ARM64XEmitter +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter { public: typedef void (Compiler::*CompileFunc)(); @@ -72,6 +81,9 @@ public: Compiler(); ~Compiler(); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Arm64Gen::ARM64Reg MapReg(int reg) { assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); @@ -89,7 +101,7 @@ public: void Reset(); - void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 numI); void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); void Comp_AddCycles_CD(); @@ -103,6 +115,9 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void LoadCycles(); + void SaveCycles(); + void Nop() {} void A_Comp_ALUTriOp(); @@ -111,6 +126,7 @@ public: void A_Comp_Mul(); void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); void A_Comp_Clz(); @@ -122,6 +138,8 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -168,7 +186,7 @@ public: void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); - void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); enum { memop_Writeback = 1 << 0, @@ -179,16 +197,33 @@ public: }; void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); - void* Gen_MemoryRoutine9(int size, bool store); - - void* Gen_MemoryRoutine9Seq(bool store, bool preinc); - void* Gen_MemoryRoutine7Seq(bool store, bool preinc); - // 0 = switch mode, 1 = stay arm, 2 = stay thumb void* Gen_JumpTo9(int kind); void* Gen_JumpTo7(int kind); - void Comp_BranchSpecialBehaviour(); + void 
Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; bool Exit; @@ -202,22 +237,20 @@ public: BitSet32 SavedRegs; - u32 JitMemUseableSize; + u32 JitMemSecondarySize; + u32 JitMemMainSize; void* ReadBanked, *WriteBanked; - // [size][store] - void* MemFunc9[3][2]; - void* MemFunc7[3][2]; - - // [store][pre increment] - void* MemFuncsSeq9[2][2]; - // "[code in main ram] - void* MemFuncsSeq7[2][2]; - void* JumpToFuncs9[3]; void* JumpToFuncs7[3]; + std::unordered_map LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + RegisterCache RegCache; bool CPSRDirty = false; diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! 
+ stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18 \ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6cf710b..b307d0e 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -2,286 +2,62 @@ #include "../Config.h" +#include "../ARMJIT_Memory.h" + using namespace Arm64Gen; namespace ARMJIT { -// W0 - address -// (if store) W1 - value to store -// W2 - code cycles -void* Compiler::Gen_MemoryRoutine9(int size, bool store) +bool Compiler::IsJITFault(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - u32 addressMask; - switch (size) - { - case 32: addressMask = ~3; break; - case 16: addressMask = ~1; break; - case 8: addressMask = ~0; break; - } - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W3, W0, W3); - CMP(W3, W4); - FixupBranch insideDTCM = B(CC_LO); - - UBFX(W4, W0, 24, 8); 
- CMP(W4, 0x02); - FixupBranch outsideMainRAM = B(CC_NEQ); - ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); - MOVP2R(X4, NDS::MainRAM); - if (!store && size == 32) - { - LDR(W3, X3, X4); - ANDI2R(W0, W0, 3); - LSL(W0, W0, 3); - RORV(W0, W3, W0); - } - else if (store) - STRGeneric(size, W1, X3, X4); - else - LDRGeneric(size, false, W0, X3, X4); - RET(); - - SetJumpTarget(outsideMainRAM); - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W3); - FixupBranch insideITCM = B(CC_LO); - - if (store) - { - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickTailCall(X4, NDS::ARM9Write32); break; - case 16: QuickTailCall(X4, NDS::ARM9Write16); break; - case 8: QuickTailCall(X4, NDS::ARM9Write8); break; - } - } - else - { - if (size == 32) - ABI_PushRegisters({0, 30}); - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; - case 16: QuickTailCall (X4, NDS::ARM9Read16); break; - case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; - } - if (size == 32) - { - ABI_PopRegisters({1, 30}); - ANDI2R(W1, W1, 3); - LSL(W1, W1, 3); - RORV(W0, W0, W1); - RET(); - } - } - - SetJumpTarget(insideDTCM); - ANDI2R(W3, W3, 0x3FFF & addressMask); - ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - - RET(); - - SetJumpTarget(insideITCM); - ANDI2R(W3, W0, 0x7FFF & addressMask); - if (store) - { - ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4); - LSR(W5, W0, 9); - MOVP2R(X4, CodeRanges); - ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W4); - ABI_PushRegisters({1, 3, 30}); - QuickCallFunction(X4, InvalidateByAddr); - 
ABI_PopRegisters({1, 3, 30}); - SetJumpTarget(null); - } - ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - RET(); - - return res; + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); } -/* - W0 - base address - X1 - stack space - W2 - values count -*/ -void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +s64 Compiler::RewriteMemAccess(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W4, W0, W4); - CMP(W4, W5); - FixupBranch insideDTCM = B(CC_LO); + auto it = LoadStorePatches.find(pcOffset); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W4); - FixupBranch insideITCM = B(CC_LO); - - ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once - if (store) + if (it != LoadStorePatches.end()) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM9Write32); + LoadStorePatch patch = it->second; - ABI_PopRegisters({0, 1, 2, 30}); - } - else - { - QuickCallFunction(X4, NDS::ARM9Read32); - MOV(W4, W0); + ptrdiff_t curCodeOffset = GetCodeOffset(); - ABI_PopRegisters({0, 1, 2, 30}); + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); - STR(X4, X1, ArithOption(X2, true)); - } + BL(patch.PatchFunc); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); - SetJumpTarget(insideDTCM); + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); - ANDI2R(W4, W4, ~3 & 0x3FFF); - ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); - if (store) - { - 
LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } + SetCodePtrUnsafe(curCodeOffset); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - - SetJumpTarget(insideITCM); - - ANDI2R(W4, W0, ~3 & 0x7FFF); - - ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X6); - } - else - { - LDR(W5, RCPU, X6); - STR(X5, X1, ArithOption(X2, true)); - } + LoadStorePatches.erase(it); - if (store) - { - ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); - LSR(W6, W4, 9); - MOVP2R(X5, CodeRanges); - ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W5); - ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W4); - QuickCallFunction(X5, InvalidateByAddr); - ABI_PopRegisters({0, 1, 2, 4, 30}); - SetJumpTarget(null); + return patch.PatchOffset; } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; + printf("this is a JIT bug! 
%08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); } -void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - AlignCode16(); - void* res = GetRXPtr(); + u32 localAddr = LocaliseCodeAddress(Num, addr); - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); - - ABI_PushRegisters({0, 1, 2, 30}); - if (store) + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM7Write32); - ABI_PopRegisters({0, 1, 2, 30}); + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - else - { - QuickCallFunction(X4, NDS::ARM7Read32); - MOV(W4, W0); - ABI_PopRegisters({0, 1, 2, 30}); - STR(X4, X1, ArithOption(X2, true)); - } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; -} + Comp_AddCycles_CDI(); -void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -309,6 +85,8 @@ void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); + + return true; } void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) @@ -318,163 +96,209 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) addressMask = ~3; if (size == 16) addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } if (flags & memop_Store) Comp_AddCycles_CD(); else Comp_AddCycles_CDI(); - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) - { - u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); - return; - } + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; } + ARM64Reg finalAddr = W0; + if (flags & memop_Post) { - ARM64Reg rdMapped = MapReg(rd); - ARM64Reg rnMapped = MapReg(rn); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; + finalAddr = rnMapped; + MOV(W0, rnMapped); + } - void* memFunc = Num == 0 - ? MemFunc9[size >> 4][!!(flags & memop_Store)] - : MemFunc7[size >> 4][!!((flags & memop_Store))]; + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might has become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) { - u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) - { - ARMv5* cpu5 = (ARMv5*)CurCPU; + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } - } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); - if (region.Mem != NULL) - { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); - MOVP2R(X0, ptr); - if (flags & memop_Store) - STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); - else - { - LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); - if (size == 32 && addr & ~0x3) - ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); - } - return; - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } - } + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); - ARM64Reg finalAddr = W0; - if (flags & memop_Post) - { - finalAddr = rnMapped; - MOV(W0, rnMapped); - } + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); if (flags & memop_Store) - MOV(W1, rdMapped); - - if (!offset.IsImm) - Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); - // offset might become an immediate - if (offset.IsImm) { - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Imm); - else - ADD(finalAddr, rnMapped, offset.Imm); + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); } else { - if (offset.Reg.ShiftType == ST_ROR) + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? 
X1 : X0, X7); + if (size == 32) { - ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); - offset = Op2(W0); + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); } - - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); - else - ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); } - if (!(flags & memop_Post) && (flags & memop_Writeback)) - MOV(rnMapped, W0); + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); - if (inlinePreparation) + if (func) { - if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) - ANDI2R(rdMapped, W0, 3); - if (size > 8) - ANDI2R(W0, W0, addressMask); + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } } - QuickCallFunction(X2, memFunc); - if (!(flags & memop_Store)) + else { - if (inlinePreparation && !(flags & memop_Store) && size == 32) + if (Num == 0) { - if (constLocalROR32 == 4) + MOV(X1, RCPU); + if (flags & memop_Store) { - LSL(rdMapped, rdMapped, 3); - RORV(rdMapped, W0, rdMapped); + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9); break; + case 16: QuickCallFunction(X3, SlowWrite9); break; + case 8: QuickCallFunction(X3, SlowWrite9); break; + } } - else if (constLocalROR32 > 0) - ROR_(rdMapped, W0, constLocalROR32 << 3); else - MOV(rdMapped, W0); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9); break; + case 16: QuickCallFunction(X3, SlowRead9); 
break; + case 8: QuickCallFunction(X3, SlowRead9); break; + } + } } - else if (flags & memop_SignExtend) + else { - if (size == 16) - SXTH(rdMapped, W0); - else if (size == 8) - SXTB(rdMapped, W0); + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7); break; + case 16: QuickCallFunction(X3, SlowWrite7); break; + case 8: QuickCallFunction(X3, SlowWrite7); break; + } + } else - assert("What's wrong with you?"); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead7); break; + case 16: QuickCallFunction(X3, SlowRead7); break; + case 8: QuickCallFunction(X3, SlowRead7); break; + } + } } - else - MOV(rdMapped, W0); - - if (CurInstr.Info.Branches()) + + if (!(flags & memop_Store)) { - if (size < 32) - printf("LDR size < 32 branching?\n"); - Comp_JumpTo(rdMapped, Num == 0, false); + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); } } } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } } void Compiler::A_Comp_MemWB() @@ -589,19 +413,11 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - { - Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr); - Comp_AddCycles_CDI(); - } - else - { - bool negative = addr < R15; - u32 abs = negative ? R15 - addr : addr - R15; - Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? 
memop_SubtractOffset : 0); - } + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -621,15 +437,138 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (regsCount == 0) return 0; // actually not the right behaviour TODO: fix me - SUB(SP, SP, ((regsCount + 1) & ~1) * 8); - if (store) + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 
4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; - int i = regsCount - 1; + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); BitSet16::Iterator it = regs.begin(); while (it != regs.end()) @@ -641,7 +580,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (usermode && reg >= 8 && reg < 15) { - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) MOV(W3, 
MapReg(reg)); else LoadReg(reg, W3); @@ -651,55 +590,67 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else if (!usermode && nextReg != regs.end()) { - ARM64Reg first = W3; - ARM64Reg second = W4; + ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); else LoadReg(reg, W3); - if (RegCache.Mapping[*nextReg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); - i--; + i++; it++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) + { STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } else { LoadReg(reg, W3); STR(INDEX_UNSIGNED, W3, SP, i * 8); } - i--; + i++; it++; } } - if (decrement) - { - SUB(W0, MapReg(rn), regsCount * 4); - preinc ^= true; - } - else - MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); MOVI2R(W2, regsCount); - BL(Num ? 
MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer9); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7); break; + } + } if (!store) { - Comp_AddCycles_CDI(); - if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + UBFX(W5, RCPSR, 0, 5); - int i = regsCount - 1; BitSet16::Iterator it = regs.begin(); while (it != regs.end()) { @@ -714,11 +665,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc MOVI2R(W1, reg - 8); BL(WriteBanked); FixupBranch alreadyWritten = CBNZ(W4); - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) MOV(MapReg(reg), W3); - RegCache.DirtyRegs |= 1 << reg; - } else SaveReg(reg, W3); SetJumpTarget(alreadyWritten); @@ -727,20 +675,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; - } - if (RegCache.Mapping[*nextReg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); - if (*nextReg != 15) - RegCache.DirtyRegs |= 1 << *nextReg; - } - LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); if (first == W3) SaveReg(reg, W3); @@ -748,15 +688,12 @@ 
s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SaveReg(*nextReg, W4); it++; - i--; + i++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) { ARM64Reg mapped = MapReg(reg); LDR(INDEX_UNSIGNED, mapped, SP, i * 8); - - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; } else { @@ -765,11 +702,20 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } it++; - i--; + i++; } } ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + if (!store && regs[15]) { ARM64Reg mapped = MapReg(15); diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +} \ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4e45760..19684c4 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -3,8 +3,11 @@ #include "types.h" #include +#include +#include #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // here lands everything which doesn't fit into ARMJIT.h // where it would be included by pretty much everything @@ -160,8 +163,8 @@ public: Data.SetLength(numAddresses * 2 + numLiterals); } - u32 PseudoPhysicalAddr; - + u32 StartAddr; + u32 StartAddrLocal; u32 InstrHash, LiteralHash; u8 Num; u16 NumAddresses; @@ -175,28 +178,8 @@ public: { return &Data[NumAddresses]; } u32* Literals() { return &Data[NumAddresses * 2]; } - u32* Links() - { return &Data[NumAddresses * 2 + NumLiterals]; } - - u32 NumLinks() - { return Data.Length - 
NumAddresses * 2 - NumLiterals; } - - void AddLink(u32 link) - { - Data.Add(link); - } - - void ResetLinks() - { - Data.SetLength(NumAddresses * 2 + NumLiterals); - } private: - /* - 0.. Data; }; @@ -207,45 +190,32 @@ struct __attribute__((packed)) AddressRange u32 Code; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemoryStatus9[0x800000]; -extern u8 MemoryStatus7[0x800000]; - extern TinyVector InvalidLiterals; -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); - -template -void LinkBlock(ARM* cpu, u32 codeOffset); +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; -enum +inline bool PageContainsCode(AddressRange* range) { - memregion_Other = 0, - memregion_ITCM, - memregion_DTCM, - memregion_BIOS9, - memregion_MainRAM, - memregion_SWRAM9, - memregion_SWRAM7, - memregion_IO9, - memregion_VRAM, - memregion_BIOS7, - memregion_WRAM7, - memregion_IO7, - memregion_Wifi, - memregion_VWRAM, -}; + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); -int ClassifyAddress9(u32 addr); -int ClassifyAddress7(u32 addr); +template +void LinkBlock(ARM* cpu, u32 codeOffset); -template T SlowRead9(ARMv5* cpu, u32 addr); -template void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template T SlowRead9(u32 addr, ARMv5* cpu); +template void SlowWrite9(u32 addr, ARMv5* cpu, T val); template T SlowRead7(u32 addr); template void SlowWrite7(u32 addr, T val); diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..162827d --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,822 @@ +#ifdef __SWITCH__ +#include "switch/compat_switch.h" +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "GPU.h" +#include "GPU3D.h" 
+#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and map the memory regions as they're structured on the DS + in it. + + On most systems you have a single piece of main ram, + maybe some video ram and faster cache RAM and that's about it. + Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is 16kb SWRAM which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform specific code. + +*/ + +namespace ARMJIT_Memory +{ +#ifdef __aarch64__ +struct FaultDescription +{ + u64 IntegerRegisters[33]; + u64 FaultAddr; + + u32 GetEmulatedAddr() + { + // now this is podracing + return (u32)IntegerRegisters[0]; + } + u64 RealAddr() + { + return FaultAddr; + } + + u64 GetPC() + { + return IntegerRegisters[32]; + } + + void RestoreAndRepeat(s64 offset); +}; +#else +struct FaultDescription +{ + u64 GetPC() + { + return 0; + } + + u32 GetEmulatedAddr() + { + return 0; + } + u64 RealAddr() + { + return 0; + } + + void RestoreAndRepeat(s64 offset); +}; +#endif + +void FaultHandler(FaultDescription* faultDesc); +} + + +#ifdef __aarch64__ + +extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +#endif + +#ifdef __SWITCH__ +// with LTO the symbols seem to be not properly overriden +// if they're somewhere else + +extern "C" +{ +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); + desc.IntegerRegisters[29] = ctx->fp.x; + desc.IntegerRegisters[30] = ctx->lr.x; + 
desc.IntegerRegisters[31] = ctx->sp.x; + desc.IntegerRegisters[32] = ctx->pc.x; + + ARMJIT_Memory::FaultHandler(&desc); + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} +#endif + +namespace ARMJIT_Memory +{ + +#ifdef __aarch64__ +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + IntegerRegisters[32] += offset; + + ARM_RestoreContext(IntegerRegisters); +} +#else +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + +} +#endif + +void* FastMem9Start, *FastMem7Start; + +const u32 MemoryTotalSize = + NDS::MainRAMSize + + NDS::SharedWRAMSize + + NDS::ARM7WRAMSize + + DTCMPhysicalSize; + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; +const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; +const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#ifdef __SWITCH__ +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#else +u8* MemoryBase; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + printf("%x\n", r); + return R_SUCCEEDED(r); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + printf("%x skip\n", NDS::ARM9->DTCMSize); + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } + } + } + } +}; +ARMJIT::TinyVector Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 
effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? memstate_MappedProtected : memstate_MappedRW; + + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then map the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCBEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCBEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); + bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + 
mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + { + Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + } + Mappings[memregion_SWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsMappable(region)) + return false; + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + + if (!isMapped) + return false; + + // this calculation even works with DTCM + // which doesn't have to be aligned to it's own size + u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; + + u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < memorySize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < memorySize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeded); + } + } + } + + Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + + return true; +} + +void FaultHandler(FaultDescription* faultDesc) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->GetEmulatedAddr(); + + if ((NDS::CurCPU == 0 ? 
MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + + s64 offset = 0; + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + } + faultDesc->RestoreAndRepeat(offset); + } +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeded); + + // 8 GB of address space, just don't ask... + FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; +#else + MemoryBase = new u8[MemoryTotalSize]; + + NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; +#endif +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#else + delete[] MemoryBase; +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + 
Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsMappable(int region) +{ + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mappingStart = 0; + mappingSize = NDS::ARM9->ITCMSize; + memorySize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_DTCM: + if (num == 0) + { + mappingStart = NDS::ARM9->DTCMBase; + mappingSize = NDS::ARM9->DTCMSize; + memorySize = DTCMPhysicalSize; + return true; + } + return false; + case memregion_BIOS9: + if (num == 0) + { + mappingStart = 0xFFFF0000; + mappingSize = 0x10000; + memorySize = 0x1000; + return true; + } + return false; + case memregion_MainRAM: + mappingStart = 0x2000000; + mappingSize = 0x1000000; + memorySize = NDS::MainRAMSize; + return true; + case memregion_SWRAM: + mappingStart = 0x3000000; + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mappingSize = 0x1000000; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM9.Mask + 1; + return true; + } + else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mappingSize = 0x800000; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + // this is a gross simplification + // mostly to make code on vram working + // it doesn't take any of the actual VRAM mappings into account + mappingStart = 0x6000000; + mappingSize = 0x1000000; + memorySize = 0x100000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mappingStart = 0; + mappingSize = 0x4000; + 
memorySize = 0x4000; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + if (NDS::SWRAM_ARM7.Mem) + { + mappingStart = 0x3800000; + mappingSize = 0x800000; + } + else + { + mappingStart = 0x3000000; + mappingSize = 0x1000000; + } + memorySize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mappingStart = 0x6000000; + mappingSize = 0x1000000; + memorySize = 0x20000; + return true; + } + return false; + default: + // for the JIT we don't are about the rest + return false; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM9.Mem) + return memregion_SWRAM; + else + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7.Mem) + return memregion_SWRAM; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + 
case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead; + case 9: return NULL; + case 16: return (void*)VRAMRead; + case 17: return (void*)VRAMWrite; + case 32: return (void*)VRAMRead; + case 33: return (void*)VRAMWrite; + } + break; + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if 
(addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } + } + } + return NULL; +} + +} \ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..1a59d98 --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,53 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 
addr); + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); + +bool IsMappable(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd3fb70..34c1c91 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -301,24 +301,6 @@ Compiler::Compiler() RET(); } - { - CPSRDirty = true; - BranchStub[0] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<0>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - - CPSRDirty = true; - BranchStub[1] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<1>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - } - // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -520,6 +502,11 @@ void Compiler::Reset() FarCode = FarStart; } +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + void Compiler::Comp_SpecialBranchBehaviour(bool taken) { if (taken && CurInstr.BranchFlags & branch_IdleBranch) @@ -531,32 +518,11 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) RegCache.PrepareExit(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) - && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - 
NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)&ARM_Ret, true); - } + JMP((u8*)&ARM_Ret, true); } } -JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... { @@ -575,7 +541,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; // CPSR might have been modified in a previous block - CPSRDirty = Config::JIT_BrancheOptimisations == 2; + CPSRDirty = false; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); @@ -685,31 +651,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F RegCache.Flush(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 - && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) - && (!instrs[instrsCount - 1].Info.Branches() - || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken - || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)ARM_Ret, true); - } + JMP((u8*)ARM_Ret, true); /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -720,22 +662,6 @@ JitBlockEntry 
Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F return res; } -void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - JMP((u8*)entry, true); - SetCodePtr(curPtr); -} - -void Compiler::UnlinkBlock(u32 offset) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - NOP(5); - SetCodePtr(curPtr); -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f2fc301..09ac257 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -52,10 +52,7 @@ public: void Reset(); - void LinkBlock(u32 offset, JitBlockEntry entry); - void UnlinkBlock(u32 offset); - - JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -202,6 +199,10 @@ public: SetCodePtr(FarCode); } + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + u8* FarCode; u8* NearCode; u32 FarSize; @@ -216,8 +217,6 @@ public: bool Exit; bool IrregularCycles; - void* BranchStub[2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index cf0bd23..0bf2f83 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -15,6 +15,11 @@ int squeezePointer(T* ptr) return truncated; } +s32 Compiler::RewriteMemAccess(u64 pc) +{ + return 0; +} + /* According to DeSmuME and my own research, approx. 
99% (seriously, that's an empirical number) of all memory load and store instructions always access addresses in the same region as @@ -27,14 +32,15 @@ int squeezePointer(T* ptr) bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); + return false; + //u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - } + }*/ u32 val; // make sure arm7 bios is accessible @@ -95,7 +101,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - if (!addrIsStatic) + if (true) { OpArg rnMapped = MapReg(rn); if (Thumb && rn == 15) @@ -145,7 +151,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, rnMapped, R(finalAddr)); } - int expectedTarget = Num == 0 + /*int expectedTarget = Num == 0 ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); if (CurInstr.Cond() < 0xE) @@ -184,8 +190,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (addrIsStatic && compileSlowPath) MOV(32, R(RSCRATCH3), Imm32(staticAddress)); - - if (compileFastPath) +*/ + /*if (compileFastPath) { FixupBranch slowPath; if (compileSlowPath) @@ -357,15 +363,16 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz SetJumpTarget(slowPath); } } - - if (compileSlowPath) +*/ + if (true) { PushRegs(false); if (Num == 0) { - MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); if (flags & memop_Store) { MOV(32, R(ABI_PARAM3), rdMapped); @@ -423,13 +430,13 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); } } - +/* if (compileFastPath && compileSlowPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!(flags & memop_Store) && rd == 15) { @@ -458,7 +465,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 stackAlloc = ((regsCount + 1) & ~1) * 8; #endif u32 allocOffset = stackAlloc - regsCount * 8; - +/* int expectedTarget = Num == 0 ? 
ClassifyAddress9(CurInstr.DataRegion) : ClassifyAddress7(CurInstr.DataRegion); @@ -479,7 +486,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc default: break; } - +*/ if (!store) Comp_AddCycles_CDI(); else @@ -492,7 +499,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else MOV(32, R(RSCRATCH4), MapReg(rn)); - +/* if (compileFastPath) { assert(!usermode); @@ -570,7 +577,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SwitchToFarCode(); SetJumpTarget(slowPath); - } + }*/ if (!store) { @@ -696,13 +703,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PopRegs(false); } - +/* if (compileFastPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!store && regs[15]) { diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b50e821..ccec951 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -206,15 +206,14 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15, - - T_SetNZ = 1 << 16, - T_SetCV = 1 << 17, - T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19, - T_SetC = 1 << 20, + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, - T_WriteMem = 1 << 21, + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -256,31 +255,31 @@ const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); -const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); -const u32 T_LDR_REG = T_Write0 | T_Read3 | 
T_Read6 | tk(tk_LDR_REG); -const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); -const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); -const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); -const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); -const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); -const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); -const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); -const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); -const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); -const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); const 
u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); @@ -347,7 +346,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_BranchAlways) res.DstRegs |= (1 << 15); - if (data & T_PopPC && instr & (1 << 8)) + if (res.Kind == tk_POP && instr & (1 << 8)) res.DstRegs |= 1 << 15; if (data & T_SetNZ) @@ -364,11 +363,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; - if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + if (data & T_LoadMem) { - if (!Config::JIT_LiteralOptimisations) - res.SrcRegs |= 1 << 15; - res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) @@ -401,11 +407,17 @@ Info Decode(bool thumb, u32 num, u32 instr) else if ((instr >> 28) == 0xF) data = ak(ak_Nop); - if (data & A_UnkOnARM7 && num != 0) + if (data & A_UnkOnARM7 && num == 1) data = A_UNK; res.Kind = (data >> 22) & 0x1FF; + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + if (res.Kind == ak_MCR) { u32 cn = (instr >> 16) & 0xF; @@ -490,8 +502,13 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_WriteMem) res.SpecialKind = special_WriteMem; - if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) - res.SpecialKind = special_LoadLiteral; + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } if (res.Kind == ak_LDM) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 6ab4929..a702435 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -232,6 +232,7 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, + special_LoadMem, special_WaitForInterrupt, special_LoadLiteral 
}; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f35b3e9..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,9 +55,11 @@ if (ENABLE_JIT) enable_language(ASM) target_sources(core PRIVATE - ARMJIT.cpp ARM_InstrInfo.cpp + ARMJIT.cpp + ARMJIT_Memory.cpp + dolphin/CommonFuncs.cpp ) @@ -85,6 +87,8 @@ if (ENABLE_JIT) ARMJIT_A64/ARMJIT_ALU.cpp ARMJIT_A64/ARMJIT_LoadStore.cpp ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s ) endif() endif() diff --git a/src/CP15.cpp b/src/CP15.cpp index 225847e..3d64259 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -22,6 +22,7 @@ #include "DSi.h" #include "ARM.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -42,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -75,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -98,36 +99,30 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { -#ifdef JIT_ENABLED - u32 oldDTCMBase = DTCMBase; - u32 oldDTCMSize = DTCMSize; -#endif + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldDTCMBase != 
DTCMBase || oldDTCMSize != DTCMSize) + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) { - ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); - ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; } -#endif } void ARMv5::UpdateITCMSetting() { -#ifdef JIT_ENABLED - u32 oldITCMSize = ITCMSize; -#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -138,10 +133,6 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldITCMSize != ITCMSize) - ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); -#endif } @@ -581,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. %08X\n", val); + //Halt(255); return; @@ -723,7 +717,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -750,13 +744,13 @@ void ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -773,13 +767,13 @@ void ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = 
*(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -796,13 +790,13 @@ void ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -817,13 +811,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -838,16 +832,16 @@ void ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -864,16 +858,16 @@ void ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -890,16 +884,16 @@ void 
ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -914,16 +908,16 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 22e9c11..edf84f2 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -47,8 +47,9 @@ int JIT_LiteralOptimisations = true; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; +int JIT_BrancheOptimisations = true; int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; #endif ConfigEntry ConfigFile[] = @@ -72,8 +73,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 
31fa67a..7b19a4b 100644 --- a/src/Config.h +++ b/src/Config.h @@ -63,6 +63,7 @@ extern int JIT_Enable; extern int JIT_MaxBlockSize; extern int JIT_BrancheOptimisations; extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 657241f..3d65482 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,7 @@ #include "AREngine.h" #include "Platform.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -94,17 +95,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -171,6 +172,10 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); +#else + MainRAM = new u8[MainRAMSize]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; #endif DMAs[0] = new DMA(0, 0); @@ -485,6 +490,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -510,7 +519,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -587,10 +596,6 @@ void Reset() } AREngine::Reset(); - -#ifdef JIT_ENABLED - ARMJIT::Reset(); -#endif } void Stop() @@ -705,7 +710,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -1128,43 +1133,40 @@ void MapSharedWRAM(u8 val) if (val == WRAMCnt) return; + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch 
(WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } - -#ifdef JIT_ENABLED - ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); - ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); -#endif } @@ -1835,12 +1837,12 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & MainRAMMask]; + return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1900,12 +1902,12 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & MainRAMMask]; + return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1968,9 +1970,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 
0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -2026,7 +2028,7 @@ void ARM9Write8(u32 addr, u8 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2035,12 +2037,12 @@ void ARM9Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2085,7 +2087,7 @@ void ARM9Write16(u32 addr, u16 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2094,12 +2096,12 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2113,18 +2115,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - 
ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2165,7 +2165,7 @@ void ARM9Write32(u32 addr, u32 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2174,12 +2174,12 @@ void ARM9Write32(u32 addr, u32 val) return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2193,18 +2193,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC(addr, val); - return; + default: GPU::WriteVRAM_LCDC(addr, val); return; } case 0x07000000: @@ -2250,10 +2248,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2292,17 +2290,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return 
*(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2352,17 +2350,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2419,17 +2417,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2474,7 +2472,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2483,28 +2481,28 @@ void ARM7Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u8*)&SWRAM_ARM7.Mem[addr & 
SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2514,7 +2512,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2551,7 +2549,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2560,28 +2558,28 @@ void ARM7Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + 
*(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2599,7 +2597,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2638,7 +2636,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2647,28 +2645,28 @@ void ARM7Write32(u32 addr, u32 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2687,7 +2685,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2736,17 +2734,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting 
at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } diff --git a/src/NDS.h b/src/NDS.h index e9b56da..4b4f9a1 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,20 +162,20 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; -extern u8 SharedWRAM[0x8000]; -extern u8* SWRAM_ARM9; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM9Mask; -extern u32 SWRAM_ARM7Mask; - -extern u8 ARM7WRAM[0x10000]; +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); -- cgit v1.2.3 From ea6d03581b689738d0d1930b28d1588019cf4077 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 15 Jun 2020 15:51:19 +0200 Subject: make literal optimisation work again enable single register block load/store optimisations for x64 aswell --- src/ARMJIT_x64/ARMJIT_Compiler.h | 12 +++---- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 72 ++++++++++++++++++++++++------------- 2 files changed, 54 insertions(+), 30 deletions(-) (limited to 'src/ARMJIT_x64') diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 09ac257..d1a6c07 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -18,15 +18,15 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; const Gen::X64Reg RSCRATCH4 = 
Gen::R8; -struct ComplexOperand +struct Op2 { - ComplexOperand() + Op2() {} - ComplexOperand(u32 imm) + Op2(u32 imm) : IsImm(true), Imm(imm) {} - ComplexOperand(int reg, int op, int amount) + Op2(int reg, int op, int amount) : IsImm(false) { Reg.Reg = reg; @@ -135,9 +135,9 @@ public: memop_Store = 1 << 3, memop_SubtractOffset = 1 << 4 }; - void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); + void Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - bool Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 0bf2f83..b780c55 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -30,17 +30,18 @@ s32 Compiler::RewriteMemAccess(u64 pc) improvement. */ -bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - return false; - //u32 translatedAddr = Num == 0 ? 
TranslateAddr9(addr) : TranslateAddr7(addr); + u32 localAddr = LocaliseCodeAddress(Num, addr); - /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - }*/ + } + + Comp_AddCycles_CDI(); u32 val; // make sure arm7 bios is accessible @@ -52,23 +53,29 @@ bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) val = ROR(val, (addr & 0x3) << 3); } else if (size == 16) + { CurCPU->DataRead16(addr & ~0x1, &val); + if (signExtend) + val = ((s32)val << 16) >> 16; + } else + { CurCPU->DataRead8(addr, &val); + if (signExtend) + val = ((s32)val << 24) >> 24; + } CurCPU->R[15] = tmpR15; MOV(32, MapReg(rd), Imm32(val)); if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); - - Comp_AddCycles_CDI(); - + return true; } -void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) +void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flags) { u32 addressMask = ~0; if (size == 32) @@ -76,11 +83,11 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); - if (Comp_MemLoadLiteral(size, rd, addr)) + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) return; } @@ -455,6 +462,23 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { int regsCount = regs.Count(); + if (regsCount == 0) + return 0; // actually not the right behaviour TODO: fix me + + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) + { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes @@ -743,10 +767,10 @@ void Compiler::A_Comp_MemWB() if (!(CurInstr.Instr & (1 << 23))) flags |= memop_SubtractOffset; - ComplexOperand offset; + Op2 offset; if (!(CurInstr.Instr & (1 << 25))) { - offset = ComplexOperand(CurInstr.Instr & 0xFFF); + offset = Op2(CurInstr.Instr & 0xFFF); } else { @@ -754,7 +778,7 @@ void Compiler::A_Comp_MemWB() int amount = (CurInstr.Instr >> 7) & 0x1F; int rm = CurInstr.A_Reg(0); - offset = ComplexOperand(rm, op, amount); + offset = Op2(rm, op, amount); } Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); @@ -762,9 +786,9 @@ void Compiler::A_Comp_MemWB() void Compiler::A_Comp_MemHalf() { - ComplexOperand offset = CurInstr.Instr & (1 << 22) - ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : ComplexOperand(CurInstr.A_Reg(0), 0, 0); + Op2 offset = CurInstr.Instr & (1 << 22) + ? 
Op2(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : Op2(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -806,7 +830,7 @@ void Compiler::T_Comp_MemReg() bool load = op & 0x2; bool byte = op & 0x1; - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), byte ? 8 : 32, load ? 0 : memop_Store); } @@ -839,7 +863,7 @@ void Compiler::T_Comp_MemImm() bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), byte ? 8 : 32, load ? 0 : memop_Store); } @@ -856,7 +880,7 @@ void Compiler::T_Comp_MemRegHalf() if (!load) flags |= memop_Store; - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(CurInstr.T_Reg(6), 0, 0), size, flags); } @@ -865,7 +889,7 @@ void Compiler::T_Comp_MemImmHalf() u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16, load ? 
0 : memop_Store); } @@ -873,8 +897,8 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) - Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -882,7 +906,7 @@ void Compiler::T_Comp_MemSPRel() u32 offset = (CurInstr.Instr & 0xFF) * 4; bool load = CurInstr.Instr & (1 << 11); - Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 0 : memop_Store); } -- cgit v1.2.3 From c5381d2911d47fb1fcbd6ec27a83f5da3606c4bd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 30 Jun 2020 23:50:41 +0200 Subject: reconcile DSi and JIT, fastmem for x64 and Windows --- src/ARM.cpp | 23 +- src/ARM.h | 2 +- src/ARMJIT.cpp | 273 +-- src/ARMJIT.h | 2 + src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 4 +- src/ARMJIT_Internal.h | 12 +- src/ARMJIT_Memory.cpp | 636 ++++-- src/ARMJIT_Memory.h | 16 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 109 + src/ARMJIT_x64/ARMJIT_Compiler.h | 14 + src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 632 +++--- src/CP15.cpp | 21 + src/Config.cpp | 20 +- src/Config.h | 9 +- src/DSi.cpp | 167 +- src/DSi.h | 15 + src/DSi_I2C.cpp | 4 +- src/NDS.cpp | 41 +- src/NDS.h | 2 + src/frontend/qt_sdl/EmuSettingsDialog.cpp | 115 +- src/frontend/qt_sdl/EmuSettingsDialog.h | 5 +- src/frontend/qt_sdl/EmuSettingsDialog.ui | 598 +++--- src/frontend/qt_sdl/main.cpp | 9 +- src/frontend/qt_sdl/main.h | 1 + src/libui_sdl/DlgEmuSettings.cpp | 252 --- src/libui_sdl/libui/ui.h | 764 ------- src/libui_sdl/libui/unix/stddialogs.c | 126 -- src/libui_sdl/libui/windows/stddialogs.cpp | 180 -- src/libui_sdl/main.cpp | 3061 ---------------------------- 29 files changed, 1656 
insertions(+), 5457 deletions(-) delete mode 100644 src/libui_sdl/DlgEmuSettings.cpp delete mode 100644 src/libui_sdl/libui/ui.h delete mode 100644 src/libui_sdl/libui/unix/stddialogs.c delete mode 100644 src/libui_sdl/libui/windows/stddialogs.cpp delete mode 100644 src/libui_sdl/main.cpp (limited to 'src/ARMJIT_x64') diff --git a/src/ARM.cpp b/src/ARM.cpp index e529be8..8530795 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,12 +21,15 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" -#include "ARMJIT.h" #include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif // instruction timing notes // @@ -109,6 +112,12 @@ void ARM::Reset() CodeMem.Mem = NULL; +#ifdef JIT_ENABLED + FastBlockLookup = NULL; + FastBlockLookupStart = 0; + FastBlockLookupSize = 0; +#endif + // zorp JumpTo(ExceptionBase); } @@ -752,6 +761,12 @@ void ARMv4::Execute() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #ifdef JIT_ENABLED @@ -820,6 +835,12 @@ void ARMv4::ExecuteJIT() if (Halted == 2) Halted = 0; + + if (Halted == 4) + { + DSi::SoftReset(); + Halted = 2; + } } #endif diff --git a/src/ARM.h b/src/ARM.h index b7f16d6..0248e26 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -147,7 +147,7 @@ public: NDS::MemRegion CodeMem; #ifdef JIT_ENABLED - u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; #endif diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 53b28c1..2a61c38 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -18,6 +18,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "SPU.h" @@ -38,25 +39,35 @@ namespace ARMJIT Compiler* JITCompiler; AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; -AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange 
CodeIndexMainRAM[NDS::MainRAMMaxSize / 512]; AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; AddressRange CodeIndexVRAM[0x100000 / 512]; AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; +AddressRange CodeIndexBIOS9DSi[0x10000 / 512]; +AddressRange CodeIndexBIOS7DSi[0x10000 / 512]; +AddressRange CodeIndexNWRAM_A[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_B[DSi::NWRAMSize / 512]; +AddressRange CodeIndexNWRAM_C[DSi::NWRAMSize / 512]; std::unordered_map JitBlocks9; std::unordered_map JitBlocks7; u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; -u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMMaxSize / 2]; u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; u64 FastBlockLookupVRAM[0x100000 / 2]; u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; +u64 FastBlockLookupBIOS9DSi[0x10000 / 2]; +u64 FastBlockLookupBIOS7DSi[0x10000 / 2]; +u64 FastBlockLookupNWRAM_A[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_B[DSi::NWRAMSize / 2]; +u64 FastBlockLookupNWRAM_C[DSi::NWRAMSize / 2]; const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { @@ -64,7 +75,7 @@ const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = ITCMPhysicalSize, 0, sizeof(NDS::ARM9BIOS), - NDS::MainRAMSize, + NDS::MainRAMMaxSize, NDS::SharedWRAMSize, 0, 0x100000, @@ -73,6 +84,11 @@ const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = 0, 0, 0x40000, + 0x10000, + 0x10000, + sizeof(DSi::NWRAM_A), + sizeof(DSi::NWRAM_B), + sizeof(DSi::NWRAM_C), }; AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = @@ -90,6 +106,11 @@ AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = NULL, 
NULL, CodeIndexARM7WVRAM, + CodeIndexBIOS9DSi, + CodeIndexBIOS7DSi, + CodeIndexNWRAM_A, + CodeIndexNWRAM_B, + CodeIndexNWRAM_C }; u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = @@ -106,7 +127,12 @@ u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = FastBlockLookupARM7WRAM, NULL, NULL, - FastBlockLookupARM7WVRAM + FastBlockLookupARM7WVRAM, + FastBlockLookupBIOS9DSi, + FastBlockLookupBIOS7DSi, + FastBlockLookupNWRAM_A, + FastBlockLookupNWRAM_B, + FastBlockLookupNWRAM_C }; u32 LocaliseCodeAddress(u32 num, u32 addr) @@ -115,21 +141,14 @@ u32 LocaliseCodeAddress(u32 num, u32 addr) ? ARMJIT_Memory::ClassifyAddress9(addr) : ARMJIT_Memory::ClassifyAddress7(addr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize) - && CodeMemRegions[region]) - { - addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - addr |= (u32)region << 28; - return addr; - } + if (CodeMemRegions[region]) + return ARMJIT_Memory::LocaliseAddress(region, num, addr); return 0; } TinyVector InvalidLiterals; -template +template T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; @@ -141,11 +160,11 @@ T SlowRead9(u32 addr, ARMv5* cpu) else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; else if (std::is_same::value) - val = NDS::ARM9Read32(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read32 : DSi::ARM9Read32)(addr); else if (std::is_same::value) - val = NDS::ARM9Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM9Read16 : DSi::ARM9Read16)(addr); else - val = NDS::ARM9Read8(addr); + val = (ConsoleType == 0 ? 
NDS::ARM9Read8 : DSi::ARM9Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -153,7 +172,7 @@ T SlowRead9(u32 addr, ARMv5* cpu) return val; } -template +template void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); @@ -169,27 +188,19 @@ void SlowWrite9(u32 addr, ARMv5* cpu, T val) } else if (std::is_same::value) { - NDS::ARM9Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write32 : DSi::ARM9Write32)(addr, val); } else if (std::is_same::value) { - NDS::ARM9Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write16 : DSi::ARM9Write16)(addr, val); } else { - NDS::ARM9Write8(addr, val); + (ConsoleType == 0 ? NDS::ARM9Write8 : DSi::ARM9Write8)(addr, val); } } -template void SlowWrite9(u32, ARMv5*, u32); -template void SlowWrite9(u32, ARMv5*, u16); -template void SlowWrite9(u32, ARMv5*, u8); - -template u32 SlowRead9(u32, ARMv5*); -template u16 SlowRead9(u32, ARMv5*); -template u8 SlowRead9(u32, ARMv5*); - -template +template T SlowRead7(u32 addr) { u32 offset = addr & 0x3; @@ -197,11 +208,11 @@ T SlowRead7(u32 addr) T val; if (std::is_same::value) - val = NDS::ARM7Read32(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read32 : DSi::ARM7Read32)(addr); else if (std::is_same::value) - val = NDS::ARM7Read16(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read16 : DSi::ARM7Read16)(addr); else - val = NDS::ARM7Read8(addr); + val = (ConsoleType == 0 ? NDS::ARM7Read8 : DSi::ARM7Read8)(addr); if (std::is_same::value) return ROR(val, offset << 3); @@ -209,67 +220,71 @@ T SlowRead7(u32 addr) return val; } -template +template void SlowWrite7(u32 addr, T val) { addr &= ~(sizeof(T) - 1); if (std::is_same::value) - NDS::ARM7Write32(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write32 : DSi::ARM7Write32)(addr, val); else if (std::is_same::value) - NDS::ARM7Write16(addr, val); + (ConsoleType == 0 ? NDS::ARM7Write16 : DSi::ARM7Write16)(addr, val); else - NDS::ARM7Write8(addr, val); + (ConsoleType == 0 ? 
NDS::ARM7Write8 : DSi::ARM7Write8)(addr, val); } -template +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite9(addr, cpu, data[i]); + SlowWrite9(addr, cpu, data[i]); else - data[i] = SlowRead9(addr, cpu); + data[i] = SlowRead9(addr, cpu); addr += 4; } } -template +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; - if (PreInc) - addr += 4; for (int i = 0; i < num; i++) { if (Write) - SlowWrite7(addr, data[i]); + SlowWrite7(addr, data[i]); else - data[i] = SlowRead7(addr); + data[i] = SlowRead7(addr); addr += 4; } } -template void SlowWrite7(u32, u32); -template void SlowWrite7(u32, u16); -template void SlowWrite7(u32, u8); - -template u32 SlowRead7(u32); -template u16 SlowRead7(u32); -template u8 SlowRead7(u32); - -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +#define INSTANTIATE_SLOWMEM(consoleType) \ + template void SlowWrite9(u32, ARMv5*, u32); \ + template void SlowWrite9(u32, ARMv5*, u16); \ + template void SlowWrite9(u32, ARMv5*, u8); \ + \ + template u32 SlowRead9(u32, ARMv5*); \ + template u16 SlowRead9(u32, ARMv5*); \ + template u8 SlowRead9(u32, ARMv5*); \ + \ + template void SlowWrite7(u32, u32); \ + template void SlowWrite7(u32, u16); \ + template void SlowWrite7(u32, u8); \ + \ + template u32 SlowRead7(u32); \ + template u16 SlowRead7(u32); \ + template u8 SlowRead7(u32); \ + \ + template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void 
SlowBlockTransfer9(u32, u64*, u32, ARMv5*); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); \ + +INSTANTIATE_SLOWMEM(0) +INSTANTIATE_SLOWMEM(1) template struct UnreliableHashTable @@ -616,6 +631,12 @@ void CompileBlock(ARM* cpu) u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; auto existingBlockIt = map.find(blockAddr); if (existingBlockIt != map.end()) @@ -623,18 +644,24 @@ void CompileBlock(ARM* cpu) // there's already a block, though it's not inside the fast map // could be that there are two blocks at the same physical addr // but different mirrors - u32 localAddr = existingBlockIt->second->StartAddrLocal; + u32 otherLocalAddr = existingBlockIt->second->StartAddrLocal; - u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; - *entry = ((u64)blockAddr | cpu->Num) << 32; - *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); - return; - } + if (localAddr == otherLocalAddr) + { + JIT_DEBUGPRINT("switching out block %x %x %x\n", localAddr, blockAddr, existingBlockIt->second->StartAddr); - u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); - if (!localAddr) - { - printf("trying to compile non executable code? 
%x\n", blockAddr); + u64* entry = &FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + // some memory has been remapped + JitBlock* prevBlock = RestoreCandidates.Insert(existingBlockIt->second->InstrHash, existingBlockIt->second); + if (prevBlock) + delete prevBlock; + + map.erase(existingBlockIt); } FetchedInstr instrs[Config::JIT_MaxBlockSize]; @@ -655,7 +682,7 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -678,7 +705,7 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); - assert(translatedAddr); + assert(translatedAddr >> 27); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -727,7 +754,10 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; - if (instrs[i].Info.DstRegs & (1 << 14)) + if (instrs[i].Info.DstRegs & (1 << 14) + || (!thumb + && (instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_IMM || instrs[i].Info.Kind == ARMInstrInfo::ak_MSR_REG) + && instrs[i].Instr & (1 << 16))) hasLink = false; if (thumb) @@ -792,7 +822,7 @@ void CompileBlock(ARM* cpu) i--; } - if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + if (instrs[i].Info.Branches() && Config::JIT_BranchOptimisations) { bool hasBranched = cpu->R[15] != r15; @@ -830,8 +860,6 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetLocalised = 
LocaliseCodeAddress(cpu->Num, target); - if (link) { lr = linkAddr; @@ -927,6 +955,8 @@ void CompileBlock(ARM* cpu) FloodFillSetFlags(instrs, i - 1, 0xF); block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); + + JIT_DEBUGPRINT("block start %p\n", block->EntryPoint); } else { @@ -940,12 +970,12 @@ void CompileBlock(ARM* cpu) assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + AddressRange* region = CodeMemRegions[addressRanges[j] >> 27]; - if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) - ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + if (!PageContainsCode(®ion[(addressRanges[j] & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 27, addressRanges[j] & 0x7FFFFFF, true); - AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + AddressRange* range = ®ion[(addressRanges[j] & 0x7FFFFFF) / 512]; range->Code |= addressMasks[j]; range->Blocks.Add(block); } @@ -955,7 +985,7 @@ void CompileBlock(ARM* cpu) else JitBlocks7[blockAddr] = block; - u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + u64* entry = &FastBlockLookupRegions[(localAddr >> 27)][(localAddr & 0x7FFFFFF) / 2]; *entry = ((u64)blockAddr | cpu->Num) << 32; *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } @@ -964,8 +994,8 @@ void InvalidateByAddr(u32 localAddr) { JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* region = CodeMemRegions[localAddr >> 28]; - AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + AddressRange* region = CodeMemRegions[localAddr >> 27]; + AddressRange* range = ®ion[(localAddr & 0x7FFFFFF) / 512]; u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; @@ -994,9 +1024,9 @@ void InvalidateByAddr(u32 localAddr) range->Blocks.Remove(i); if (range->Blocks.Length == 0 - && 
!PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + && !PageContainsCode(®ion[(localAddr & 0x7FFF000) / 512])) { - ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + ARMJIT_Memory::SetCodeProtection(localAddr >> 27, localAddr & 0x7FFFFFF, false); } bool literalInvalidation = false; @@ -1019,8 +1049,8 @@ void InvalidateByAddr(u32 localAddr) u32 addr = block->AddressRanges()[j]; if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRegion = CodeMemRegions[addr >> 28]; - AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 27]; + AddressRange* otherRange = &otherRegion[(addr & 0x7FFFFFF) / 512]; assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); @@ -1028,15 +1058,15 @@ void InvalidateByAddr(u32 localAddr) if (otherRange->Blocks.Length == 0) { - if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) - ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + if (!PageContainsCode(&otherRegion[(addr & 0x7FFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 27, addr & 0x7FFFFFF, false); otherRange->Code = 0; } } } - FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; + FastBlockLookupRegions[block->StartAddrLocal >> 27][(block->StartAddrLocal & 0x7FFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) JitBlocks9.erase(block->StartAddr); else @@ -1055,19 +1085,25 @@ void InvalidateByAddr(u32 localAddr) } } -template -void CheckAndInvalidate(u32 addr) +void CheckAndInvalidateITCM() { - // let's hope this gets all properly inlined - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + for (u32 i = 0; i < ITCMPhysicalSize; i+=16) { - u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; - if 
(CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) - InvalidateByAddr(localAddr | (region << 28)); + if (CodeIndexITCM[i / 512].Code & (1 << ((i & 0x1FF) / 16))) + { + InvalidateByAddr(i | (ARMJIT_Memory::memregion_ITCM << 27)); + } } } +template +void CheckAndInvalidate(u32 addr) +{ + u32 localAddr = ARMJIT_Memory::LocaliseAddress(region, num, addr); + if (CodeMemRegions[region][(localAddr & 0x7FFFFFF) / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr); +} + JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) { u64* entry = &entries[offset / 2]; @@ -1076,35 +1112,44 @@ JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) return NULL; } +void blockSanityCheck(u32 num, u32 blockAddr, JitBlockEntry entry) +{ + u32 localAddr = LocaliseCodeAddress(num, blockAddr); + assert(JITCompiler->AddEntryOffset((u32)FastBlockLookupRegions[localAddr >> 27][(localAddr & 0x7FFFFFF) / 2]) == entry); +} + bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { + // amazingly ignoring the DTCM is the proper behaviour for code fetches int region = num == 0 ? 
ARMJIT_Memory::ClassifyAddress9(blockAddr) : ARMJIT_Memory::ClassifyAddress7(blockAddr); - u32 mappingStart, mappingSize, memoryOffset, memorySize; - if (CodeMemRegions[region] - && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, - mappingSize, memoryOffset, memorySize)) + u32 memoryOffset; + if (FastBlockLookupRegions[region] + && ARMJIT_Memory::GetMirrorLocation(region, num, blockAddr, memoryOffset, start, size)) { + //printf("setup exec region %d %d %08x %08x %x %x\n", num, region, blockAddr, start, size, memoryOffset); entry = FastBlockLookupRegions[region] + memoryOffset / 2; - // evil, though it should work for everything except DTCM which is not relevant here - start = blockAddr & ~(memorySize - 1); - size = memorySize; return true; } - else - return false; + return false; } template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); -template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); -template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); +template void 
CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(u32); void ResetBlockCache() { @@ -1133,7 +1178,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } @@ -1145,7 +1190,7 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + AddressRange* range = &CodeMemRegions[addr >> 27][(addr & 0x7FFFFFF) / 512]; range->Blocks.Clear(); range->Code = 0; } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2320b7b..04add59 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -16,6 +16,8 @@ void DeInit(); void Reset(); +void CheckAndInvalidateITCM(); + void InvalidateByAddr(u32 pseudoPhysical); template diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index b307d0e..c1b23a7 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -168,7 +168,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); - if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) { ptrdiff_t memopStart = GetCodeOffset(); LoadStorePatch patch; @@ -461,7 +461,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); bool compileFastPath = Config::JIT_FastMemory - && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); if (decrement) { diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 19684c4..c87e1b3 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -214,13 +214,13 @@ u32 LocaliseCodeAddress(u32 num, u32 addr); template void LinkBlock(ARM* cpu, u32 codeOffset); -template T SlowRead9(u32 addr, ARMv5* cpu); -template void SlowWrite9(u32 addr, ARMv5* cpu, T val); -template T SlowRead7(u32 addr); -template void SlowWrite7(u32 addr, T val); +template T SlowRead9(u32 addr, ARMv5* cpu); +template void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); -template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); -template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); } diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp index 162827d..0276c65 100644 --- a/src/ARMJIT_Memory.cpp +++ b/src/ARMJIT_Memory.cpp @@ -1,5 +1,7 @@ -#ifdef __SWITCH__ +#if defined(__SWITCH__) #include "switch/compat_switch.h" +#elif defined(_WIN32) +#include #endif #include "ARMJIT_Memory.h" @@ -7,6 +9,7 @@ #include 
"ARMJIT_Internal.h" #include "ARMJIT_Compiler.h" +#include "DSi.h" #include "GPU.h" #include "GPU3D.h" #include "Wifi.h" @@ -37,66 +40,24 @@ namespace ARMJIT_Memory { -#ifdef __aarch64__ -struct FaultDescription -{ - u64 IntegerRegisters[33]; - u64 FaultAddr; - - u32 GetEmulatedAddr() - { - // now this is podracing - return (u32)IntegerRegisters[0]; - } - u64 RealAddr() - { - return FaultAddr; - } - - u64 GetPC() - { - return IntegerRegisters[32]; - } - - void RestoreAndRepeat(s64 offset); -}; -#else struct FaultDescription { - u64 GetPC() - { - return 0; - } - - u32 GetEmulatedAddr() - { - return 0; - } - u64 RealAddr() - { - return 0; - } - - void RestoreAndRepeat(s64 offset); + u32 EmulatedFaultAddr; + u64 FaultPC; }; -#endif -void FaultHandler(FaultDescription* faultDesc); +bool FaultHandler(FaultDescription* faultDesc, s32& offset); } - -#ifdef __aarch64__ - -extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); - -#endif - -#ifdef __SWITCH__ +#if defined(__SWITCH__) // with LTO the symbols seem to be not properly overriden // if they're somewhere else extern "C" { + +void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + extern char __start__; extern char __rodata_start; @@ -106,57 +67,85 @@ u64 __nx_exception_stack_size = 0x8000; void __libnx_exception_handler(ThreadExceptionDump* ctx) { ARMJIT_Memory::FaultDescription desc; - memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); - desc.IntegerRegisters[29] = ctx->fp.x; - desc.IntegerRegisters[30] = ctx->lr.x; - desc.IntegerRegisters[31] = ctx->sp.x; - desc.IntegerRegisters[32] = ctx->pc.x; + desc.EmulatedFaultAddr = ctx->cpu_gprs[0].w; + desc.FaultPC = ctx->pc.x; + + u64 integerRegisters[33]; + memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); + integerRegisters[29] = ctx->fp.x; + integerRegisters[30] = ctx->lr.x; + integerRegisters[31] = ctx->sp.x; + integerRegisters[32] = ctx->pc.x; + + s32 offset; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + 
integerRegisters[32] += offset; - ARMJIT_Memory::FaultHandler(&desc); + ARM_RestoreContext(integerRegisters); + } if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) { - printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); } else { - printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); } } } + +#elif defined(_WIN32) + +static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) +{ + if (exceptionInfo->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) + return EXCEPTION_CONTINUE_SEARCH; + + ARMJIT_Memory::FaultDescription desc; + desc.EmulatedFaultAddr = exceptionInfo->ContextRecord->Rcx; + desc.FaultPC = exceptionInfo->ContextRecord->Rip; + + s32 offset = 0; + if (ARMJIT_Memory::FaultHandler(&desc, offset)) + { + exceptionInfo->ContextRecord->Rip += offset; + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + #endif namespace ARMJIT_Memory { -#ifdef __aarch64__ -void FaultDescription::RestoreAndRepeat(s64 offset) -{ - IntegerRegisters[32] += offset; +void* FastMem9Start, *FastMem7Start; - ARM_RestoreContext(IntegerRegisters); +#ifdef _WIN32 +inline u32 RoundUp(u32 size) +{ + return (size + 0xFFFF) & ~0xFFFF; } #else -void FaultDescription::RestoreAndRepeat(s64 offset) +inline u32 RoundUp(u32 size) { - + return size; } #endif -void* FastMem9Start, *FastMem7Start; - -const u32 MemoryTotalSize = - NDS::MainRAMSize - + NDS::SharedWRAMSize - + NDS::ARM7WRAMSize - + DTCMPhysicalSize; - const u32 MemBlockMainRAMOffset = 0; -const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; -const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; -const u32 MemBlockDTCMOffset 
= NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; +const u32 MemBlockSWRAMOffset = RoundUp(NDS::MainRAMMaxSize); +const u32 MemBlockARM7WRAMOffset = MemBlockSWRAMOffset + RoundUp(NDS::SharedWRAMSize); +const u32 MemBlockDTCMOffset = MemBlockARM7WRAMOffset + RoundUp(NDS::ARM7WRAMSize); +const u32 MemBlockNWRAM_AOffset = MemBlockDTCMOffset + RoundUp(DTCMPhysicalSize); +const u32 MemBlockNWRAM_BOffset = MemBlockNWRAM_AOffset + RoundUp(DSi::NWRAMSize); +const u32 MemBlockNWRAM_COffset = MemBlockNWRAM_BOffset + RoundUp(DSi::NWRAMSize); +const u32 MemoryTotalSize = MemBlockNWRAM_COffset + RoundUp(DSi::NWRAMSize); const u32 OffsetsPerRegion[memregions_Count] = { @@ -173,6 +162,11 @@ const u32 OffsetsPerRegion[memregions_Count] = UINT32_MAX, UINT32_MAX, UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockNWRAM_AOffset, + MemBlockNWRAM_BOffset, + MemBlockNWRAM_COffset }; enum @@ -186,11 +180,13 @@ enum u8 MappingStatus9[1 << (32-12)]; u8 MappingStatus7[1 << (32-12)]; -#ifdef __SWITCH__ +#if defined(__SWITCH__) u8* MemoryBase; u8* MemoryBaseCodeMem; -#else +#elif defined(_WIN32) u8* MemoryBase; +HANDLE MemoryFile; +LPVOID ExceptionHandlerHandle; #endif bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) @@ -200,6 +196,9 @@ bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size)); return R_SUCCEEDED(r); +#elif defined(_WIN32) + bool r = MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, offset, size, dst) == dst; + return r; #endif } @@ -209,8 +208,24 @@ bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) #ifdef __SWITCH__ Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), (u64)(MemoryBaseCodeMem + offset), size); - printf("%x\n", r); return R_SUCCEEDED(r); +#else + return UnmapViewOfFile(dst); +#endif +} + +void SetCodeProtectionRange(u32 addr, u32 size, u32 num, int protection) +{ + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; +#if defined(_WIN32) + DWORD winProtection, oldProtection; + if (protection == 0) + winProtection = PAGE_NOACCESS; + else if (protection == 1) + winProtection = PAGE_READONLY; + else + winProtection = PAGE_READWRITE; + VirtualProtect(dst, size, winProtection, &oldProtection); #endif } @@ -230,7 +245,6 @@ struct Mapping if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) { offset += NDS::ARM9->DTCMSize; - printf("%x skip\n", NDS::ARM9->DTCMSize); } else { @@ -245,6 +259,7 @@ struct Mapping offset += 0x1000; } +#ifdef __SWITCH__ if (status == memstate_MappedRW) { u32 segmentSize = offset - segmentOffset; @@ -252,8 +267,12 @@ struct Mapping bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); assert(success); } +#endif } } +#if defined(_WIN32) + UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); +#endif } }; ARMJIT::TinyVector Mappings[memregions_Count]; @@ -268,6 +287,8 @@ void SetCodeProtection(int region, u32 offset, bool protect) Mapping& mapping = Mappings[region][i]; u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (offset < mapping.LocalOffset || offset >= mapping.LocalOffset + mapping.Size) + continue; if (mapping.Num == 0 && region != memregion_DTCM && effectiveAddr >= NDS::ARM9->DTCMBase @@ -276,16 +297,20 @@ void SetCodeProtection(int region, u32 offset, bool protect) u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); - printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + printf("%x %d %x %x %x %d\n", effectiveAddr, mapping.Num, mapping.Addr, mapping.LocalOffset, mapping.Size, states[effectiveAddr >> 12]); assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); states[effectiveAddr >> 12] = protect ? 
memstate_MappedProtected : memstate_MappedRW; +#if defined(__SWITCH__) bool success; if (protect) success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); else success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); assert(success); +#elif defined(_WIN32) + SetCodeProtectionRange(effectiveAddr, 0x1000, mapping.Num, protect ? 1 : 2); +#endif } } @@ -314,8 +339,8 @@ void RemapDTCM(u32 newBase, u32 newSize) printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); - bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); - bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd < start); + bool newOverlap = newSize > 0 && !(newBase >= end || newEnd < start); if (mapping.Num == 0 && (oldOverlap || newOverlap)) { @@ -336,19 +361,50 @@ void RemapDTCM(u32 newBase, u32 newSize) Mappings[memregion_DTCM].Clear(); } +void RemapNWRAM(int num) +{ + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) + { + Mapping& mapping = Mappings[memregion_SharedWRAM][i]; + if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size + || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + { + mapping.Unmap(memregion_SharedWRAM); + Mappings[memregion_SharedWRAM].Remove(i); + } + else + { + i++; + } + } + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + num].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + num][i].Unmap(memregion_NewSharedWRAM_A + num); + } + Mappings[memregion_NewSharedWRAM_A + num].Clear(); +} + void RemapSWRAM() { printf("remapping SWRAM\n"); - for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + for (int i = 0; i < Mappings[memregion_SharedWRAM].Length; i++) { - 
Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + Mappings[memregion_SharedWRAM][i].Unmap(memregion_SharedWRAM); } - Mappings[memregion_SWRAM].Clear(); + Mappings[memregion_SharedWRAM].Clear(); for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) { Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); } Mappings[memregion_WRAM7].Clear(); + for (int j = 0; j < 3; j++) + { + for (int i = 0; i < Mappings[memregion_NewSharedWRAM_A + j].Length; i++) + { + Mappings[memregion_NewSharedWRAM_A + j][i].Unmap(memregion_NewSharedWRAM_A + j); + } + Mappings[memregion_NewSharedWRAM_A + j].Clear(); + } } bool MapAtAddress(u32 addr) @@ -359,33 +415,36 @@ bool MapAtAddress(u32 addr) ? ClassifyAddress9(addr) : ClassifyAddress7(addr); - if (!IsMappable(region)) + if (!IsFastmemCompatible(region)) return false; - u32 mappingStart, mappingSize, memoryOffset, memorySize; - bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + return false; + u32 mirrorStart, mirrorSize, memoryOffset; + bool isMapped = GetMirrorLocation(region, num, addr, memoryOffset, mirrorStart, mirrorSize); if (!isMapped) return false; - // this calculation even works with DTCM - // which doesn't have to be aligned to it's own size - u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; - u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; - printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + printf("trying to create mapping %x, %x %d %d\n", mirrorStart, mirrorSize, region, num); bool isExecutable = ARMJIT::CodeMemRegions[region]; - ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; +#if defined(_WIN32) + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); +#endif + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; // this overcomplicated piece of code basically just finds whole pieces of code memory // which can be mapped u32 offset = 0; bool skipDTCM = num == 0 && region != memregion_DTCM; - while (offset < memorySize) + while (offset < mirrorSize) { if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) { + SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); offset += NDS::ARM9->DTCMSize; } else @@ -393,7 +452,7 @@ bool MapAtAddress(u32 addr) u32 sectionOffset = offset; bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) - && offset < memorySize + && offset < mirrorSize && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) { assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); @@ -403,41 +462,49 @@ bool MapAtAddress(u32 addr) u32 sectionSize = offset - sectionOffset; +#if defined(__SWITCH__) if (!hasCode) { printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); assert(succeded); } +#elif defined(_WIN32) + if (hasCode) + { + SetCodeProtectionRange(mirrorStart + offset, sectionSize, num, 1); + } +#endif } } - 
Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + assert(num == 0 || num == 1); + Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; Mappings[region].Add(mapping); - printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); return true; } -void FaultHandler(FaultDescription* faultDesc) +bool FaultHandler(FaultDescription* faultDesc, s32& offset) { - if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) { bool rewriteToSlowPath = true; - u32 addr = faultDesc->GetEmulatedAddr(); + u32 addr = faultDesc->EmulatedFaultAddr; if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) - rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr); - s64 offset = 0; if (rewriteToSlowPath) { - offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC); } - faultDesc->RestoreAndRepeat(offset); + return true; } + return false; } void Init() @@ -459,18 +526,34 @@ void Init() FastMem7Start = virtmemReserve(0x100000000); assert(FastMem7Start); - NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; - NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; - NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; - NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; -#else - MemoryBase = new u8[MemoryTotalSize]; + u8* basePtr = MemoryBaseCodeMem; +#elif defined(_WIN32) + ExceptionHandlerHandle = AddVectoredExceptionHandler(1, ExceptionHandler); + + MemoryFile = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, MemoryTotalSize, NULL); - NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; - NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; - NDS::ARM7WRAM = MemoryBase + 
MemBlockARM7WRAMOffset; - NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; + MemoryBase = (u8*)VirtualAlloc(NULL, MemoryTotalSize, MEM_RESERVE, PAGE_READWRITE); + + FastMem9Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + FastMem7Start = VirtualAlloc(NULL, 0x100000000, MEM_RESERVE, PAGE_READWRITE); + + // only free them after they have all been reserved + // so they can't overlap + VirtualFree(MemoryBase, 0, MEM_RELEASE); + VirtualFree(FastMem9Start, 0, MEM_RELEASE); + VirtualFree(FastMem7Start, 0, MEM_RELEASE); + + MapViewOfFileEx(MemoryFile, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, MemoryTotalSize, MemoryBase); + + u8* basePtr = MemoryBase; #endif + NDS::MainRAM = basePtr + MemBlockMainRAMOffset; + NDS::SharedWRAM = basePtr + MemBlockSWRAMOffset; + NDS::ARM7WRAM = basePtr + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = basePtr + MemBlockDTCMOffset; + DSi::NWRAM_A = basePtr + MemBlockNWRAM_AOffset; + DSi::NWRAM_B = basePtr + MemBlockNWRAM_BOffset; + DSi::NWRAM_C = basePtr + MemBlockNWRAM_COffset; } void DeInit() @@ -482,8 +565,11 @@ void DeInit() svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); free(MemoryBase); -#else - delete[] MemoryBase; +#elif defined(_WIN32) + assert(UnmapViewOfFile(MemoryBase)); + CloseHandle(MemoryFile); + + RemoveVectoredExceptionHandler(ExceptionHandlerHandle); #endif } @@ -505,12 +591,23 @@ void Reset() printf("done resetting jit mem\n"); } -bool IsMappable(int region) +bool IsFastmemCompatible(int region) { +#ifdef _WIN32 + /* + TODO: with some hacks, the smaller shared WRAM regions + could be mapped in some occaisons as well + */ + if (region == memregion_DTCM + || region == memregion_SharedWRAM + || region == memregion_NewSharedWRAM_B + || region == memregion_NewSharedWRAM_C) + return false; +#endif return OffsetsPerRegion[region] != UINT32_MAX; } -bool GetRegionMapping(int region, u32 num, u32& 
mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize) { memoryOffset = 0; switch (region) @@ -518,137 +615,251 @@ bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, case memregion_ITCM: if (num == 0) { - mappingStart = 0; - mappingSize = NDS::ARM9->ITCMSize; - memorySize = ITCMPhysicalSize; + mirrorStart = addr & ~(ITCMPhysicalSize - 1); + mirrorSize = ITCMPhysicalSize; return true; } return false; - case memregion_DTCM: + case memregion_MainRAM: + mirrorStart = addr & ~NDS::MainRAMMask; + mirrorSize = NDS::MainRAMMask + 1; + return true; + case memregion_BIOS9: if (num == 0) { - mappingStart = NDS::ARM9->DTCMBase; - mappingSize = NDS::ARM9->DTCMSize; - memorySize = DTCMPhysicalSize; + mirrorStart = addr & ~0xFFF; + mirrorSize = 0x1000; return true; } return false; - case memregion_BIOS9: - if (num == 0) + case memregion_BIOS7: + if (num == 1) { - mappingStart = 0xFFFF0000; - mappingSize = 0x10000; - memorySize = 0x1000; + mirrorStart = 0; + mirrorSize = 0x4000; return true; } return false; - case memregion_MainRAM: - mappingStart = 0x2000000; - mappingSize = 0x1000000; - memorySize = NDS::MainRAMSize; - return true; - case memregion_SWRAM: - mappingStart = 0x3000000; + case memregion_SharedWRAM: if (num == 0 && NDS::SWRAM_ARM9.Mem) { - mappingSize = 0x1000000; + mirrorStart = addr & ~NDS::SWRAM_ARM9.Mask; + mirrorSize = NDS::SWRAM_ARM9.Mask + 1; memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; - memorySize = NDS::SWRAM_ARM9.Mask + 1; return true; } else if (num == 1 && NDS::SWRAM_ARM7.Mem) { - mappingSize = 0x800000; + mirrorStart = addr & ~NDS::SWRAM_ARM7.Mask; + mirrorSize = NDS::SWRAM_ARM7.Mask + 1; memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; - memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + mirrorStart = addr & 
~(NDS::ARM7WRAMSize - 1); + mirrorSize = NDS::ARM7WRAMSize; return true; } return false; case memregion_VRAM: if (num == 0) { - // this is a gross simplification - // mostly to make code on vram working - // it doesn't take any of the actual VRAM mappings into account - mappingStart = 0x6000000; - mappingSize = 0x1000000; - memorySize = 0x100000; - return true; + mirrorStart = addr & ~0xFFFFF; + mirrorSize = 0x100000; } return false; - case memregion_BIOS7: + case memregion_VWRAM: if (num == 1) { - mappingStart = 0; - mappingSize = 0x4000; - memorySize = 0x4000; + mirrorStart = addr & ~0x3FFFF; + mirrorSize = 0x40000; return true; } return false; - case memregion_WRAM7: - if (num == 1) + case memregion_NewSharedWRAM_A: { - if (NDS::SWRAM_ARM7.Mem) + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) { - mappingStart = 0x3800000; - mappingSize = 0x800000; + memoryOffset = ptr - DSi::NWRAM_A; + mirrorStart = addr & ~0xFFFF; + mirrorSize = 0x10000; + return true; } - else + return false; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) { - mappingStart = 0x3000000; - mappingSize = 0x1000000; + memoryOffset = ptr - DSi::NWRAM_B; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; } - memorySize = NDS::ARM7WRAMSize; + return false; // zero filled memory + } + case memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + { + memoryOffset = ptr - DSi::NWRAM_C; + mirrorStart = addr & ~0x7FFF; + mirrorSize = 0x8000; + return true; + } + return false; // zero filled memory + } + case memregion_BIOS9DSi: + if (num == 0) + { + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<0) ? 
0x8000 : 0x10000; return true; } return false; - case memregion_VWRAM: + case memregion_BIOS7DSi: if (num == 1) { - mappingStart = 0x6000000; - mappingSize = 0x1000000; - memorySize = 0x20000; + mirrorStart = addr & ~0xFFFF; + mirrorSize = DSi::SCFG_BIOS & (1<<8) ? 0x8000 : 0x10000; return true; } return false; default: - // for the JIT we don't are about the rest + assert(false && "For the time being this should only be used for code"); return false; } } +u32 LocaliseAddress(int region, u32 num, u32 addr) +{ + switch (region) + { + case memregion_ITCM: + return (addr & (ITCMPhysicalSize - 1)) | (memregion_ITCM << 27); + case memregion_MainRAM: + return (addr & NDS::MainRAMMask) | (memregion_MainRAM << 27); + case memregion_BIOS9: + return (addr & 0xFFF) | (memregion_BIOS9 << 27); + case memregion_BIOS7: + return (addr & 0x3FFF) | (memregion_BIOS7 << 27); + case memregion_SharedWRAM: + if (num == 0) + return ((addr & NDS::SWRAM_ARM9.Mask) + (NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + else + return ((addr & NDS::SWRAM_ARM7.Mask) + (NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM)) | (memregion_SharedWRAM << 27); + case memregion_WRAM7: + return (addr & (NDS::ARM7WRAMSize - 1)) | (memregion_WRAM7 << 27); + case memregion_VRAM: + // TODO: take mapping properly into account + return (addr & 0xFFFFF) | (memregion_VRAM << 27); + case memregion_VWRAM: + // same here + return (addr & 0x3FFFF) | (memregion_VWRAM << 27); + case memregion_NewSharedWRAM_A: + { + u8* ptr = DSi::NWRAMMap_A[num][(addr >> 16) & DSi::NWRAMMask[num][0]]; + if (ptr) + return (ptr - DSi::NWRAM_A + (addr & 0xFFFF)) | (memregion_NewSharedWRAM_A << 27); + else + return memregion_Other << 27; // zero filled memory + } + case memregion_NewSharedWRAM_B: + { + u8* ptr = DSi::NWRAMMap_B[num][(addr >> 15) & DSi::NWRAMMask[num][1]]; + if (ptr) + return (ptr - DSi::NWRAM_B + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_B << 27); + else + return memregion_Other << 27; + } + case 
memregion_NewSharedWRAM_C: + { + u8* ptr = DSi::NWRAMMap_C[num][(addr >> 15) & DSi::NWRAMMask[num][2]]; + if (ptr) + return (ptr - DSi::NWRAM_C + (addr & 0x7FFF)) | (memregion_NewSharedWRAM_C << 27); + else + return memregion_Other << 27; + } + case memregion_BIOS9DSi: + case memregion_BIOS7DSi: + return (addr & 0xFFFF) | (region << 27); + default: + assert(false && "This should only be needed for regions which can contain code"); + return memregion_Other << 27; + } +} + int ClassifyAddress9(u32 addr) { if (addr < NDS::ARM9->ITCMSize) + { return memregion_ITCM; + } else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + { return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else + } + else { + if (NDS::ConsoleType == 1 && addr >= 0xFFFF0000 && !(DSi::SCFG_BIOS & (1<<1))) + { + if ((addr >= 0xFFFF8000) && (DSi::SCFG_BIOS & (1<<0))) + return memregion_Other; + + return memregion_BIOS9DSi; + } + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + { + return memregion_BIOS9; + } + switch (addr & 0xFF000000) { case 0x02000000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[0][0] && addr < DSi::NWRAMEnd[0][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[0][1] && addr < DSi::NWRAMEnd[0][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[0][2] && addr < DSi::NWRAMEnd[0][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM9.Mem) - return memregion_SWRAM; - else - return memregion_Other; + return memregion_SharedWRAM; + return memregion_Other; case 0x04000000: return memregion_IO9; case 0x06000000: return memregion_VRAM; + default: + return memregion_Other; } } - return memregion_Other; } int ClassifyAddress7(u32 addr) { - if (addr < 0x00004000) + if (NDS::ConsoleType == 1 && addr < 0x00010000 && !(DSi::SCFG_BIOS & (1<<9))) + { + if (addr >= 0x00008000 && DSi::SCFG_BIOS & (1<<8)) + return 
memregion_Other; + + return memregion_BIOS7DSi; + } + else if (addr < 0x00004000) + { return memregion_BIOS7; + } else { switch (addr & 0xFF800000) @@ -657,10 +868,19 @@ int ClassifyAddress7(u32 addr) case 0x02800000: return memregion_MainRAM; case 0x03000000: + if (NDS::ConsoleType == 1) + { + if (addr >= DSi::NWRAMStart[1][0] && addr < DSi::NWRAMEnd[1][0]) + return memregion_NewSharedWRAM_A; + if (addr >= DSi::NWRAMStart[1][1] && addr < DSi::NWRAMEnd[1][1]) + return memregion_NewSharedWRAM_B; + if (addr >= DSi::NWRAMStart[1][2] && addr < DSi::NWRAMEnd[1][2]) + return memregion_NewSharedWRAM_C; + } + if (NDS::SWRAM_ARM7.Mem) - return memregion_SWRAM; - else - return memregion_WRAM7; + return memregion_SharedWRAM; + return memregion_WRAM7; case 0x03800000: return memregion_WRAM7; case 0x04000000: @@ -740,14 +960,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) + { + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + else { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; + switch (size | store) + { + case 8: return (void*)DSi::ARM9IORead8; + case 9: return (void*)DSi::ARM9IOWrite8; + case 16: return (void*)DSi::ARM9IORead16; + case 17: return (void*)DSi::ARM9IOWrite16; + case 32: return (void*)DSi::ARM9IORead32; + case 33: return (void*)DSi::ARM9IOWrite32; + } } break; case 0x06000000: @@ -781,14 +1016,29 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } } - switch (size | store) + if (NDS::ConsoleType == 0) { - case 
8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + } + else + { + switch (size | store) + { + case 8: return (void*)DSi::ARM7IORead8; + case 9: return (void*)DSi::ARM7IOWrite8; + case 16: return (void*)DSi::ARM7IORead16; + case 17: return (void*)DSi::ARM7IOWrite16; + case 32: return (void*)DSi::ARM7IORead32; + case 33: return (void*)DSi::ARM7IOWrite32; + } } break; case 0x04800000: diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h index 1a59d98..123e18e 100644 --- a/src/ARMJIT_Memory.h +++ b/src/ARMJIT_Memory.h @@ -23,7 +23,7 @@ enum memregion_DTCM, memregion_BIOS9, memregion_MainRAM, - memregion_SWRAM, + memregion_SharedWRAM, memregion_IO9, memregion_VRAM, memregion_BIOS7, @@ -31,18 +31,28 @@ enum memregion_IO7, memregion_Wifi, memregion_VWRAM, + + // DSi + memregion_BIOS9DSi, + memregion_BIOS7DSi, + memregion_NewSharedWRAM_A, + memregion_NewSharedWRAM_B, + memregion_NewSharedWRAM_C, + memregions_Count }; int ClassifyAddress9(u32 addr); int ClassifyAddress7(u32 addr); -bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); +bool GetMirrorLocation(int region, u32 num, u32 addr, u32& memoryOffset, u32& mirrorStart, u32& mirrorSize); +u32 LocaliseAddress(int region, u32 num, u32 addr); -bool IsMappable(int region); +bool IsFastmemCompatible(int region); void RemapDTCM(u32 newBase, u32 newSize); void RemapSWRAM(); +void RemapNWRAM(int num); void SetCodeProtection(int region, u32 offset, bool protect); diff 
--git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 34c1c91..d8bdd56 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -40,6 +40,12 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +#ifdef _WIN32 +const BitSet32 CallerSavedPushRegs({R10, R11}); +#else +const BitSet32 CallerSavedPushRegs({R9, R10, R11}); +#endif + void Compiler::PushRegs(bool saveHiRegs) { BitSet32 loadedRegs(RegCache.LoadedRegs); @@ -301,6 +307,107 @@ Compiler::Compiler() RET(); } + for (int consoleType = 0; consoleType < 2; consoleType++) + { + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 16; reg++) + { + if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2 || reg == ABI_PARAM3) + { + PatchedStoreFuncs[consoleType][num][size][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL; + PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL; + continue; + } + + X64Reg rdMapped = (X64Reg)reg; + PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + MOV(32, R(ABI_PARAM3), R(rdMapped)); + } + else + { + MOV(32, R(ABI_PARAM2), R(rdMapped)); + } + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowWrite9); break; + case 33: ABI_CallFunction(SlowWrite7); break; + case 16: ABI_CallFunction(SlowWrite9); break; + case 17: ABI_CallFunction(SlowWrite7); break; + case 8: 
ABI_CallFunction(SlowWrite9); break; + case 9: ABI_CallFunction(SlowWrite7); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr(); + if (RSCRATCH3 != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (num == 0) + MOV(64, R(ABI_PARAM2), R(RCPU)); + ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (consoleType == 0) + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9); break; + case 33: ABI_CallFunction(SlowRead7); break; + case 16: ABI_CallFunction(SlowRead9); break; + case 17: ABI_CallFunction(SlowRead7); break; + case 8: ABI_CallFunction(SlowRead9); break; + case 9: ABI_CallFunction(SlowRead7); break; + } + } + else + { + switch ((8 << size) | num) + { + case 32: ABI_CallFunction(SlowRead9); break; + case 33: ABI_CallFunction(SlowRead7); break; + case 16: ABI_CallFunction(SlowRead9); break; + case 17: ABI_CallFunction(SlowRead7); break; + case 8: ABI_CallFunction(SlowRead9); break; + case 9: ABI_CallFunction(SlowRead7); break; + } + } + ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8); + if (signextend) + MOVSX(32, 8 << size, rdMapped, R(RSCRATCH)); + else + MOVZX(32, 8 << size, rdMapped, R(RSCRATCH)); + RET(); + } + } + } + } + } + // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -500,6 +607,8 @@ void Compiler::Reset() NearCode = NearStart; FarCode = FarStart; + + LoadStorePatches.clear(); } bool Compiler::IsJITFault(u64 addr) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index d1a6c07..0fe0147 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -7,6 +7,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include + namespace ARMJIT { @@ 
-18,6 +20,13 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; const Gen::X64Reg RSCRATCH4 = Gen::R8; +struct LoadStorePatch +{ + void* PatchFunc; + s16 Offset; + u16 Size; +}; + struct Op2 { Op2() @@ -211,6 +220,11 @@ public: u8* NearStart; u8* FarStart; + void* PatchedStoreFuncs[2][2][3][16]; + void* PatchedLoadFuncs[2][2][3][2][16]; + + std::unordered_map LoadStorePatches; + u8* ResetStart; u32 CodeMemSize; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b780c55..2da113b 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -17,7 +17,30 @@ int squeezePointer(T* ptr) s32 Compiler::RewriteMemAccess(u64 pc) { - return 0; + auto it = LoadStorePatches.find((u8*)pc); + if (it != LoadStorePatches.end()) + { + LoadStorePatch patch = it->second; + LoadStorePatches.erase(it); + + u8* curCodePtr = GetWritableCodePtr(); + u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset; + SetCodePtr(rewritePtr); + + CALL(patch.PatchFunc); + u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr); + if (remainingSize > 0) + NOP(remainingSize); + + //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size); + + SetCodePtr(curCodePtr); + + return patch.Offset; + } + + printf("this is a JIT bug %x\n", pc); + abort(); } /* @@ -91,369 +114,213 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag return; } + if (flags & memop_Store) { - if (flags & memop_Store) - { - Comp_AddCycles_CD(); - } - else - { - Comp_AddCycles_CDI(); - } + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - bool addrIsStatic = Config::JIT_LiteralOptimisations - && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); - u32 staticAddress; - if (addrIsStatic) - staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); - OpArg rdMapped = MapReg(rd); + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + OpArg rdMapped = MapReg(rd); - if (true) - { - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); - X64Reg finalAddr = RSCRATCH3; - if (flags & memop_Post) - { - MOV(32, R(RSCRATCH3), rnMapped); + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) + { + MOV(32, R(RSCRATCH3), rnMapped); - finalAddr = rnMapped.GetSimpleReg(); - } + finalAddr = rnMapped.GetSimpleReg(); + } - if (op2.IsImm) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } else - { - OpArg rm = MapReg(op2.Reg.Reg); + MOV_sum(32, finalAddr, rnMapped, offset); + } + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) - { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); - } - else - { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); - if (flags & memop_SubtractOffset) - { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); - } - else - MOV_sum(32, finalAddr, rnMapped, offset); - } - } + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsFastmemCompatible(expectedTarget))) + { + u8* memopStart = GetWritableCodePtr(); + LoadStorePatch patch; + + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] + : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; - /*int expectedTarget = Num == 0 - ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) - : ClassifyAddress7(addrIsStatic ? 
staticAddress : CurInstr.DataRegion); - if (CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; + assert(patch.PatchFunc != NULL); - bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); + MOV(64, R(RSCRATCH), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); - switch (expectedTarget) + X64Reg maskedAddr = RSCRATCH3; + if (size > 8) { - case memregion_MainRAM: - case memregion_DTCM: - case memregion_WRAM7: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_IO9: - case memregion_IO7: - case memregion_VWRAM: - compileFastPath = true; - break; - case memregion_Wifi: - compileFastPath = size >= 16; - break; - case memregion_VRAM: - compileFastPath = !(flags & memop_Store) || size >= 16; - case memregion_BIOS9: - compileFastPath = !(flags & memop_Store); - break; - default: break; + maskedAddr = RSCRATCH2; + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + AND(32, R(RSCRATCH2), Imm8(addressMask)); } - if (addrIsStatic && !compileFastPath) + u8* memopLoadStoreLocation = GetWritableCodePtr(); + if (flags & memop_Store) { - compileFastPath = false; - compileSlowPath = true; + MOV(size, MRegSum(RSCRATCH, maskedAddr), rdMapped); } - - if (addrIsStatic && compileSlowPath) - MOV(32, R(RSCRATCH3), Imm32(staticAddress)); -*/ - /*if (compileFastPath) + else { - FixupBranch slowPath; - if (compileSlowPath) - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SHR(32, R(RSCRATCH), Imm8(9)); - if (flags & memop_Store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? 
MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - - slowPath = J_CC(CC_NE, true); - } + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), MRegSum(RSCRATCH, maskedAddr)); - if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 - || expectedTarget == memregion_BIOS9) + if (size == 32) { - u8* data; - u32 mask; - if (expectedTarget == memregion_MainRAM) - { - data = NDS::MainRAM; - mask = MAIN_RAM_SIZE - 1; - } - else if (expectedTarget == memregion_BIOS9) - { - data = NDS::ARM9BIOS; - mask = 0xFFF; - } - else - { - data = NDS::ARM7WRAM; - mask = 0xFFFF; - } - OpArg memLoc; - if (addrIsStatic) - { - memLoc = M(data + ((staticAddress & mask & addressMask))); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm32(mask & addressMask)); - memLoc = MDisp(RSCRATCH, squeezePointer(data)); - } - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_DTCM) - { - if (addrIsStatic) - MOV(32, R(RSCRATCH), Imm32(staticAddress)); - else - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); - } - else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) - { - MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - if (addrIsStatic) - { - MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); - } - else - { - MOV(32, R(RSCRATCH), R(RSCRATCH3)); - AND(32, R(RSCRATCH), Imm8(addressMask)); - } - AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); - if (flags & memop_Store) - MOV(size, memLoc, rdMapped); - else if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else - { - u32 maskedDataRegion; - - if (addrIsStatic) - { - maskedDataRegion = staticAddress; - MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); - } - else - { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - AND(32, R(ABI_PARAM1), Imm8(addressMask)); - - maskedDataRegion = CurInstr.DataRegion; - if (Num == 0) - maskedDataRegion &= ~0xFFFFFF; - else - maskedDataRegion &= ~0x7FFFFF; - } + } - void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); + patch.Offset = memopStart - memopLoadStoreLocation; + patch.Size = GetWritableCodePtr() - memopStart; - if (flags & memop_Store) - { - PushRegs(false); + assert(patch.Size >= 5); - MOV(32, R(ABI_PARAM2), rdMapped); + LoadStorePatches[memopLoadStoreLocation] = patch; + } + else + { + PushRegs(false); - ABI_CallFunction((void(*)())func); + if (Num == 0) + { + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); - PopRegs(false); - } - else + switch (size | NDS::ConsoleType) { - if (!addrIsStatic) - MOV(32, rdMapped, R(RSCRATCH3)); - - PushRegs(false); - - ABI_CallFunction((void(*)())func); - - PopRegs(false); - - if (!addrIsStatic) - MOV(32, R(RSCRATCH3), rdMapped); - - if 
(flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + case 33: CALL((void*)&SlowWrite9); break; + case 17: CALL((void*)&SlowWrite9); break; + case 9: CALL((void*)&SlowWrite9); break; } } - - if ((size == 32 && !(flags & memop_Store))) + else { - if (addrIsStatic) - { - if (staticAddress & 0x3) - ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); - } - else + switch (size | NDS::ConsoleType) { - AND(32, R(RSCRATCH3), Imm8(0x3)); - SHL(32, R(RSCRATCH3), Imm8(3)); - ROR_(32, rdMapped, R(RSCRATCH3)); + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + case 33: CALL((void*)&SlowRead9); break; + case 17: CALL((void*)&SlowRead9); break; + case 9: CALL((void*)&SlowRead9); break; } } - - if (compileSlowPath) - { - SwitchToFarCode(); - SetJumpTarget(slowPath); - } } -*/ - if (true) + else { - PushRegs(false); - - if (Num == 0) + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) { - MOV(64, R(ABI_PARAM2), R(RCPU)); - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) - { - MOV(32, R(ABI_PARAM3), rdMapped); + MOV(32, R(ABI_PARAM2), rdMapped); - switch (size) - { - case 32: CALL((void*)&SlowWrite9); break; - case 16: CALL((void*)&SlowWrite9); break; - case 8: CALL((void*)&SlowWrite9); break; - } - } - else + switch (size | NDS::ConsoleType) { - switch (size) - { - case 32: CALL((void*)&SlowRead9); break; - case 16: CALL((void*)&SlowRead9); break; - case 8: CALL((void*)&SlowRead9); break; - } + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + case 33: CALL((void*)&SlowWrite7); break; + case 17: 
CALL((void*)&SlowWrite7); break; + case 9: CALL((void*)&SlowWrite7); break; } } else { - if (ABI_PARAM1 != RSCRATCH3) - MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); - if (flags & memop_Store) + switch (size | NDS::ConsoleType) { - MOV(32, R(ABI_PARAM2), rdMapped); - - switch (size) - { - case 32: CALL((void*)&SlowWrite7); break; - case 16: CALL((void*)&SlowWrite7); break; - case 8: CALL((void*)&SlowWrite7); break; - } - } - else - { - switch (size) - { - case 32: CALL((void*)&SlowRead7); break; - case 16: CALL((void*)&SlowRead7); break; - case 8: CALL((void*)&SlowRead7); break; - } + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + case 33: CALL((void*)&SlowRead7); break; + case 17: CALL((void*)&SlowRead7); break; + case 9: CALL((void*)&SlowRead7); break; } } + } - PopRegs(false); + PopRegs(false); - if (!(flags & memop_Store)) - { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); - } - } -/* - if (compileFastPath && compileSlowPath) + if (!(flags & memop_Store)) { - FixupBranch ret = J(true); - SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } - if (!(flags & memop_Store) && rd == 15) + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); { - if (size < 32) - printf("!!! 
LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + if (Num == 1) { - if (Num == 1) - AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rdMapped.GetSimpleReg()); + if (Thumb) + OR(32, rdMapped, Imm8(0x1)); + else + AND(32, rdMapped, Imm8(0xFE)); } + Comp_JumpTo(rdMapped.GetSimpleReg()); } } } @@ -470,7 +337,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc int flags = 0; if (store) flags |= memop_Store; - if (decrement) + if (decrement && preinc) flags |= memop_SubtractOffset; Op2 offset = preinc ? Op2(4) : Op2(0); @@ -481,96 +348,52 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - // we need to make sure that the stack stays aligned to 16 bytes -#ifdef _WIN32 - // include shadow - u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; -#else - u32 stackAlloc = ((regsCount + 1) & ~1) * 8; -#endif - u32 allocOffset = stackAlloc - regsCount * 8; -/* int expectedTarget = Num == 0 - ? ClassifyAddress9(CurInstr.DataRegion) - : ClassifyAddress7(CurInstr.DataRegion); - if (usermode || CurInstr.Cond() < 0xE) - expectedTarget = memregion_Other; - - bool compileFastPath = false; + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); - switch (expectedTarget) - { - case memregion_DTCM: - case memregion_MainRAM: - case memregion_SWRAM9: - case memregion_SWRAM7: - case memregion_WRAM7: - compileFastPath = true; - break; - default: - break; - } -*/ if (!store) Comp_AddCycles_CDI(); else Comp_AddCycles_CD(); + bool compileFastPath = Config::JIT_FastMemory + && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsFastmemCompatible(expectedTarget)); + + // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = (((regsCount + 4 + 1) & ~1) + (compileFastPath ? 
1 : 0)) * 8; +#else + u32 stackAlloc = (((regsCount + 1) & ~1) + (compileFastPath ? 1 : 0)) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; + if (decrement) - { - MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; - } + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4 + (preinc ? 0 : 4))); else - MOV(32, R(RSCRATCH4), MapReg(rn)); -/* + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(preinc ? 4 : 0)); + if (compileFastPath) { - assert(!usermode); + AND(32, R(RSCRATCH4), Imm8(~3)); - MOV(32, R(RSCRATCH), R(RSCRATCH4)); - SHR(32, R(RSCRATCH), Imm8(9)); + u8* fastPathStart = GetWritableCodePtr(); + u8* firstLoadStoreAddr; - if (store) - { - CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); - } - else - { - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); - AND(32, R(RSCRATCH), Imm8(~0x80)); - CMP(32, R(RSCRATCH), Imm8(expectedTarget)); - } - FixupBranch slowPath = J_CC(CC_NE, true); + bool firstLoadStore = true; + + MOV(64, R(RSCRATCH2), ImmPtr(Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start)); + ADD(64, R(RSCRATCH2), R(RSCRATCH4)); + MOV(32, R(RSCRATCH3), R(RSCRATCH4)); - if (expectedTarget == memregion_DTCM) - { - SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); - LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); - } - else if (expectedTarget == memregion_MainRAM) - { - AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); - } - else if (expectedTarget == memregion_WRAM7) - { - AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); - ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); - } - else // SWRAM - { - AND(32, R(RSCRATCH4), Imm8(~3)); - AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? 
&NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); - ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); - } u32 offset = 0; for (int reg : regs) { - if (preinc) - offset += 4; - OpArg mem = MDisp(RSCRATCH4, offset); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); + + OpArg mem = MDisp(RSCRATCH2, offset); if (store) { if (RegCache.LoadedRegs & (1 << reg)) @@ -580,6 +403,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc else { LoadReg(reg, RSCRATCH); + if (firstLoadStore) + firstLoadStoreAddr = GetWritableCodePtr(); MOV(32, mem, R(RSCRATCH)); } } @@ -595,13 +420,19 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SaveReg(reg, RSCRATCH); } } - if (!preinc) - offset += 4; + offset += 4; + + firstLoadStore = false; } + LoadStorePatch patch; + patch.Size = GetWritableCodePtr() - fastPathStart; + patch.Offset = fastPathStart - firstLoadStoreAddr; SwitchToFarCode(); - SetJumpTarget(slowPath); - }*/ + patch.PatchFunc = GetWritableCodePtr(); + + LoadStorePatches[firstLoadStoreAddr] = patch; + } if (!store) { @@ -618,12 +449,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } PopRegs(false); @@ -715,25 +546,24 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (Num == 0) MOV(64, R(ABI_PARAM4), R(RCPU)); - switch (Num * 2 | preinc) + switch (Num * 2 | 
NDS::ConsoleType) { - case 0: CALL((void*)&SlowBlockTransfer9); break; - case 1: CALL((void*)&SlowBlockTransfer9); break; - case 2: CALL((void*)&SlowBlockTransfer7); break; - case 3: CALL((void*)&SlowBlockTransfer7); break; + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); PopRegs(false); } -/* + if (compileFastPath) { - FixupBranch ret = J(true); + RET(); SwitchToNearCode(); - SetJumpTarget(ret); - }*/ + } if (!store && regs[15]) { diff --git a/src/CP15.cpp b/src/CP15.cpp index 3d64259..992c83f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -608,6 +608,27 @@ void ARMv5::CP15Write(u32 id, u32 val) ITCMSetting = val; UpdateITCMSetting(); return; + + case 0xF00: + //printf("cache debug index register %08X\n", val); + return; + + case 0xF10: + //printf("cache debug instruction tag %08X\n", val); + return; + + case 0xF20: + //printf("cache debug data tag %08X\n", val); + return; + + case 0xF30: + //printf("cache debug instruction cache %08X\n", val); + return; + + case 0xF40: + //printf("cache debug data cache %08X\n", val); + return; + } if ((id&0xF00)!=0x700) diff --git a/src/Config.cpp b/src/Config.cpp index edf84f2..de1c70d 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -40,14 +40,7 @@ char DSiNANDPath[1024]; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; -int JIT_LiteralOptimisations = true; -#endif - -#ifdef JIT_ENABLED -int JIT_Enable = false; -int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = true; +int JIT_BranchOptimisations = 2; int JIT_LiteralOptimisations = true; int JIT_FastMemory = true; #endif @@ -66,16 +59,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, 
NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, - {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, -#endif - -#ifdef JIT_ENABLED - {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, - {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 2, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, - {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, + {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 7b19a4b..5916b4a 100644 --- a/src/Config.h +++ b/src/Config.h @@ -54,14 +54,7 @@ extern char DSiNANDPath[1024]; #ifdef JIT_ENABLED extern int JIT_Enable; extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; -extern int JIT_LiteralOptimisations; -#endif - -#ifdef JIT_ENABLED -extern int JIT_Enable; -extern int JIT_MaxBlockSize; -extern int JIT_BrancheOptimisations; +extern int JIT_BranchOptimisations; extern int JIT_LiteralOptimisations; extern int JIT_FastMemory; #endif diff --git a/src/DSi.cpp b/src/DSi.cpp index 216f724..97a63cd 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -26,6 +26,11 @@ #include "NDSCart.h" #include "Platform.h" +#ifdef JIT_ENABLED +#include "ARMJIT.h" +#include "ARMJIT_Memory.h" +#endif + #include "DSi_NDMA.h" #include "DSi_I2C.h" #include "DSi_SD.h" @@ -34,15 +39,6 @@ #include "tiny-AES-c/aes.hpp" -namespace NDS -{ - -extern ARMv5* ARM9; -extern ARMv4* ARM7; - -} - - namespace DSi { @@ -59,9 +55,9 @@ u8 ARM7iBIOS[0x10000]; u32 MBK[2][9]; -u8 NWRAM_A[0x40000]; -u8 NWRAM_B[0x40000]; -u8 NWRAM_C[0x40000]; +u8* NWRAM_A; +u8* NWRAM_B; +u8* NWRAM_C; u8* NWRAMMap_A[2][4]; u8* NWRAMMap_B[3][8]; @@ -86,6 +82,12 @@ u8 ARM7Init[0x3C00]; bool Init() { +#ifndef JIT_ENABLED + NWRAM_A = new u8[NWRAMSize]; + NWRAM_B = new u8[NWRAMSize]; + NWRAM_C = new 
u8[NWRAMSize]; +#endif + if (!DSi_I2C::Init()) return false; if (!DSi_AES::Init()) return false; @@ -106,6 +108,12 @@ bool Init() void DeInit() { +#ifndef JIT_ENABLED + delete[] NWRAM_A; + delete[] NWRAM_B; + delete[] NWRAM_C; +#endif + DSi_I2C::DeInit(); DSi_AES::DeInit(); @@ -176,7 +184,12 @@ void SoftReset() NDS::ARM9->Reset(); NDS::ARM7->Reset(); + NDS::ARM9->CP15Reset(); + memcpy(NDS::ARM9->ITCM, ITCMInit, 0x8000); +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidateITCM(); +#endif DSi_AES::Reset(); @@ -274,9 +287,9 @@ bool LoadNAND() { printf("Loading DSi NAND\n"); - memset(NWRAM_A, 0, 0x40000); - memset(NWRAM_B, 0, 0x40000); - memset(NWRAM_C, 0, 0x40000); + memset(NWRAM_A, 0, NWRAMSize); + memset(NWRAM_B, 0, NWRAMSize); + memset(NWRAM_C, 0, NWRAMSize); memset(MBK, 0, sizeof(MBK)); memset(NWRAMMap_A, 0, sizeof(NWRAMMap_A)); @@ -527,6 +540,8 @@ void MapNWRAM_A(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(0); + int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -558,6 +573,8 @@ void MapNWRAM_B(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(1); + int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -593,6 +610,8 @@ void MapNWRAM_C(u32 num, u8 val) return; } + ARMJIT_Memory::RemapNWRAM(2); + int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; @@ -625,6 +644,8 @@ void MapNWRAMRange(u32 cpu, u32 num, u32 val) u32 oldval = MBK[cpu][5+num]; if (oldval == val) return; + ARMJIT_Memory::RemapNWRAM(num); + MBK[cpu][5+num] = val; // TODO: what happens when the ranges are 'out of range'???? 
@@ -826,19 +847,31 @@ void ARM9Write8(u32 addr, u8 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write8(addr, val); @@ -859,19 +892,31 @@ void ARM9Write16(u32 addr, u16 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return 
NDS::ARM9Write16(addr, val); @@ -892,19 +937,31 @@ void ARM9Write32(u32 addr, u32 val) if (addr >= NWRAMStart[0][0] && addr < NWRAMEnd[0][0]) { u8* ptr = NWRAMMap_A[0][(addr >> 16) & NWRAMMask[0][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[0][1] && addr < NWRAMEnd[0][1]) { u8* ptr = NWRAMMap_B[0][(addr >> 15) & NWRAMMask[0][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[0][2] && addr < NWRAMEnd[0][2]) { u8* ptr = NWRAMMap_C[0][(addr >> 15) & NWRAMMask[0][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM9Write32(addr, val); @@ -1085,19 +1142,37 @@ void ARM7Write8(u32 addr, u8 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u8*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0xFFFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); +#endif + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u8*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u8*)&ptr[addr & 0x7FFF] = val; +#ifdef JIT_ENABLED + 
ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); +#endif + } return; } return NDS::ARM7Write8(addr, val); @@ -1118,19 +1193,31 @@ void ARM7Write16(u32 addr, u16 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u16*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u16*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u16*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write16(addr, val); @@ -1151,19 +1238,31 @@ void ARM7Write32(u32 addr, u32 val) if (addr >= NWRAMStart[1][0] && addr < NWRAMEnd[1][0]) { u8* ptr = NWRAMMap_A[1][(addr >> 16) & NWRAMMask[1][0]]; - if (ptr) *(u32*)&ptr[addr & 0xFFFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0xFFFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_A>(addr); + } return; } if (addr >= NWRAMStart[1][1] && addr < NWRAMEnd[1][1]) { u8* ptr = NWRAMMap_B[1][(addr >> 15) & NWRAMMask[1][1]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + *(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_B>(addr); + } return; } if (addr >= NWRAMStart[1][2] && addr < NWRAMEnd[1][2]) { u8* ptr = NWRAMMap_C[1][(addr >> 15) & NWRAMMask[1][2]]; - if (ptr) *(u32*)&ptr[addr & 0x7FFF] = val; + if (ptr) + { + 
*(u32*)&ptr[addr & 0x7FFF] = val; + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_NewSharedWRAM_C>(addr); + } return; } return NDS::ARM7Write32(addr, val); @@ -1521,7 +1620,7 @@ u8 ARM7IORead8(u32 addr) case 0x04004501: return DSi_I2C::Cnt; case 0x04004D00: if (SCFG_BIOS & (1<<10)) return 0; return ConsoleID & 0xFF; - case 0x04004D01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; + case 0x04004fD01: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 8) & 0xFF; case 0x04004D02: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 16) & 0xFF; case 0x04004D03: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 24) & 0xFF; case 0x04004D04: if (SCFG_BIOS & (1<<10)) return 0; return (ConsoleID >> 32) & 0xFF; diff --git a/src/DSi.h b/src/DSi.h index 8cc8fd5..40f22bb 100644 --- a/src/DSi.h +++ b/src/DSi.h @@ -25,6 +25,8 @@ namespace DSi { +extern u16 SCFG_BIOS; + extern u8 ARM9iBIOS[0x10000]; extern u8 ARM7iBIOS[0x10000]; @@ -34,6 +36,19 @@ extern u64 ConsoleID; extern DSi_SDHost* SDMMC; extern DSi_SDHost* SDIO; +const u32 NWRAMSize = 0x40000; + +extern u8* NWRAM_A; +extern u8* NWRAM_B; +extern u8* NWRAM_C; + +extern u8* NWRAMMap_A[2][4]; +extern u8* NWRAMMap_B[3][8]; +extern u8* NWRAMMap_C[3][8]; + +extern u32 NWRAMStart[2][3]; +extern u32 NWRAMEnd[2][3]; +extern u32 NWRAMMask[2][3]; bool Init(); void DeInit(); diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp index 9984f5e..e22c708 100644 --- a/src/DSi_I2C.cpp +++ b/src/DSi_I2C.cpp @@ -21,6 +21,7 @@ #include "DSi.h" #include "DSi_I2C.h" #include "DSi_Camera.h" +#include "ARM.h" namespace DSi_BPTWL @@ -108,7 +109,8 @@ void Write(u8 val, bool last) printf("BPTWL: soft-reset\n"); val = 0; // checkme // TODO: soft-reset might need to be scheduled later! 
- DSi::SoftReset(); + // TODO: this has been moved for the JIT to work, nothing is confirmed here + NDS::ARM7->Halt(4); CurPos = -1; return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 3d65482..6981a42 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -32,8 +32,11 @@ #include "Wifi.h" #include "AREngine.h" #include "Platform.h" + +#ifdef JIT_ENABLED #include "ARMJIT.h" #include "ARMJIT_Memory.h" +#endif #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -173,7 +176,7 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); #else - MainRAM = new u8[MainRAMSize]; + MainRAM = new u8[0x1000000]; ARM7WRAM = new u8[ARM7WRAMSize]; SharedWRAM = new u8[SharedWRAMSize]; #endif @@ -1837,7 +1840,7 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -1902,7 +1905,7 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; + return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: if (SWRAM_ARM9.Mem) @@ -2031,16 +2034,13 @@ void ARM9Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2090,16 +2090,13 @@ void ARM9Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2168,16 +2165,13 @@ void ARM9Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return ; case 0x03000000: if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } @@ -2235,7 +2229,7 @@ void ARM9Write32(u32 addr, u32 val) return; } - printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); + //printf("unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9->R[15]); } bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) @@ -2475,16 +2469,13 @@ void ARM7Write8(u32 addr, u8 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2552,16 +2543,13 @@ void ARM7Write16(u32 addr, u16 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); 
#endif *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; @@ -2639,16 +2627,13 @@ void ARM7Write32(u32 addr, u32 val) ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; -#ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); -#endif return; case 0x03000000: if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SharedWRAM>(addr); #endif *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; diff --git a/src/NDS.h b/src/NDS.h index 4b4f9a1..e0a5045 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -165,6 +165,8 @@ extern u16 ARM7BIOSProt; extern u8* MainRAM; extern u32 MainRAMMask; +const u32 MainRAMMaxSize = 0x1000000; + const u32 SharedWRAMSize = 0x8000; extern u8* SharedWRAM; diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 09faf4e..9ee7b9a 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -32,6 +32,7 @@ EmuSettingsDialog* EmuSettingsDialog::currentDlg = nullptr; extern char* EmuDirectory; +extern bool RunningSomething; EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::EmuSettingsDialog) @@ -53,6 +54,22 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->cbxConsoleType->setCurrentIndex(Config::ConsoleType); ui->chkDirectBoot->setChecked(Config::DirectBoot != 0); + +#ifdef JIT_ENABLED + ui->chkEnableJIT->setChecked(Config::JIT_Enable != 0); + ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); + ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); + ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); +#else + ui->chkEnableJIT->setDisabled(true); + 
ui->chkJITBranchOptimisations->setDisabled(true); + ui->chkJITLiteralOptimisations->setDisabled(true); + ui->chkJITFastMemory->setDisabled(true); + ui->spnJITMaximumBlockSize->setDisabled(true); +#endif + + on_chkEnableJIT_toggled(); } EmuSettingsDialog::~EmuSettingsDialog() @@ -102,29 +119,78 @@ void EmuSettingsDialog::verifyFirmware() } } -void EmuSettingsDialog::on_EmuSettingsDialog_accepted() +void EmuSettingsDialog::done(int r) { - verifyFirmware(); - - strncpy(Config::BIOS9Path, ui->txtBIOS9Path->text().toStdString().c_str(), 1023); Config::BIOS9Path[1023] = '\0'; - strncpy(Config::BIOS7Path, ui->txtBIOS7Path->text().toStdString().c_str(), 1023); Config::BIOS7Path[1023] = '\0'; - strncpy(Config::FirmwarePath, ui->txtFirmwarePath->text().toStdString().c_str(), 1023); Config::FirmwarePath[1023] = '\0'; - - strncpy(Config::DSiBIOS9Path, ui->txtDSiBIOS9Path->text().toStdString().c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; - strncpy(Config::DSiBIOS7Path, ui->txtDSiBIOS7Path->text().toStdString().c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; - strncpy(Config::DSiFirmwarePath, ui->txtDSiFirmwarePath->text().toStdString().c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; - strncpy(Config::DSiNANDPath, ui->txtDSiNANDPath->text().toStdString().c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; - - Config::ConsoleType = ui->cbxConsoleType->currentIndex(); - Config::DirectBoot = ui->chkDirectBoot->isChecked() ? 1:0; - - Config::Save(); + if (r == QDialog::Accepted) + { + verifyFirmware(); + + int consoleType = ui->cbxConsoleType->currentIndex(); + int directBoot = ui->chkDirectBoot->isChecked() ? 1:0; + + int jitEnable = ui->chkEnableJIT->isChecked() ? 1:0; + int jitMaxBlockSize = ui->spnJITMaximumBlockSize->value(); + int jitBranchOptimisations = ui->chkJITBranchOptimisations->isChecked() ? 1:0; + int jitLiteralOptimisations = ui->chkJITLiteralOptimisations->isChecked() ? 1:0; + int jitFastMemory = ui->chkJITFastMemory->isChecked() ? 
1:0; + + std::string bios9Path = ui->txtBIOS9Path->text().toStdString(); + std::string bios7Path = ui->txtBIOS7Path->text().toStdString(); + std::string firmwarePath = ui->txtFirmwarePath->text().toStdString(); + std::string dsiBios9Path = ui->txtDSiBIOS9Path->text().toStdString(); + std::string dsiBios7Path = ui->txtDSiBIOS7Path->text().toStdString(); + std::string dsiFirmwarePath = ui->txtDSiFirmwarePath->text().toStdString(); + std::string dsiNANDPath = ui->txtDSiNANDPath->text().toStdString(); + + if (consoleType != Config::ConsoleType + || directBoot != Config::DirectBoot +#ifdef JIT_ENABLED + || jitEnable != Config::JIT_Enable + || jitMaxBlockSize != Config::JIT_MaxBlockSize + || jitBranchOptimisations != Config::JIT_BranchOptimisations + || jitLiteralOptimisations != Config::JIT_LiteralOptimisations + || jitFastMemory != Config::JIT_FastMemory +#endif + || strcmp(Config::BIOS9Path, bios9Path.c_str()) != 0 + || strcmp(Config::BIOS7Path, bios7Path.c_str()) != 0 + || strcmp(Config::FirmwarePath, firmwarePath.c_str()) != 0 + || strcmp(Config::DSiBIOS9Path, dsiBios9Path.c_str()) != 0 + || strcmp(Config::DSiBIOS7Path, dsiBios7Path.c_str()) != 0 + || strcmp(Config::DSiFirmwarePath, dsiFirmwarePath.c_str()) != 0 + || strcmp(Config::DSiNANDPath, dsiNANDPath.c_str()) != 0) + { + if (RunningSomething + && QMessageBox::warning(this, "Reset necessary to apply changes", + "The emulation will be reset for the changes to take place", + QMessageBox::Yes, QMessageBox::Cancel) != QMessageBox::Yes) + return; + + strncpy(Config::BIOS9Path, bios9Path.c_str(), 1023); Config::BIOS9Path[1023] = '\0'; + strncpy(Config::BIOS7Path, bios7Path.c_str(), 1023); Config::BIOS7Path[1023] = '\0'; + strncpy(Config::FirmwarePath, firmwarePath.c_str(), 1023); Config::FirmwarePath[1023] = '\0'; + + strncpy(Config::DSiBIOS9Path, dsiBios9Path.c_str(), 1023); Config::DSiBIOS9Path[1023] = '\0'; + strncpy(Config::DSiBIOS7Path, dsiBios7Path.c_str(), 1023); Config::DSiBIOS7Path[1023] = '\0'; + 
strncpy(Config::DSiFirmwarePath, dsiFirmwarePath.c_str(), 1023); Config::DSiFirmwarePath[1023] = '\0'; + strncpy(Config::DSiNANDPath, dsiNANDPath.c_str(), 1023); Config::DSiNANDPath[1023] = '\0'; + + #ifdef JIT_ENABLED + Config::JIT_Enable = jitEnable; + Config::JIT_MaxBlockSize = jitMaxBlockSize; + Config::JIT_BranchOptimisations = jitBranchOptimisations; + Config::JIT_LiteralOptimisations = jitLiteralOptimisations; + Config::JIT_FastMemory = jitFastMemory; + #endif + + Config::ConsoleType = consoleType; + Config::DirectBoot = directBoot; + + Config::Save(); + } + } - closeDlg(); -} + QDialog::done(r); -void EmuSettingsDialog::on_EmuSettingsDialog_rejected() -{ closeDlg(); } @@ -211,3 +277,12 @@ void EmuSettingsDialog::on_btnDSiNANDBrowse_clicked() ui->txtDSiNANDPath->setText(file); } + +void EmuSettingsDialog::on_chkEnableJIT_toggled() +{ + bool disabled = !ui->chkEnableJIT->isChecked(); + ui->chkJITBranchOptimisations->setDisabled(disabled); + ui->chkJITLiteralOptimisations->setDisabled(disabled); + ui->chkJITFastMemory->setDisabled(disabled); + ui->spnJITMaximumBlockSize->setDisabled(disabled); +} \ No newline at end of file diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.h b/src/frontend/qt_sdl/EmuSettingsDialog.h index f604ba5..268036c 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.h +++ b/src/frontend/qt_sdl/EmuSettingsDialog.h @@ -51,8 +51,7 @@ public: } private slots: - void on_EmuSettingsDialog_accepted(); - void on_EmuSettingsDialog_rejected(); + void done(int r); void on_btnBIOS9Browse_clicked(); void on_btnBIOS7Browse_clicked(); @@ -63,6 +62,8 @@ private slots: void on_btnDSiFirmwareBrowse_clicked(); void on_btnDSiNANDBrowse_clicked(); + void on_chkEnableJIT_toggled(); + private: void verifyFirmware(); diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.ui b/src/frontend/qt_sdl/EmuSettingsDialog.ui index 4894fa5..11d48cc 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.ui +++ b/src/frontend/qt_sdl/EmuSettingsDialog.ui @@ -6,8 +6,8 @@ 0 0 - 490 
- 392 + 514 + 359 @@ -24,243 +24,336 @@ QLayout::SetFixedSize - - - DS mode + + + 0 - - - - - - 0 - 0 - - - - - 290 - 0 - - - - - - - <html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html> - - - - - - - DS firmware: - - - - - - - DS ARM7 BIOS: - - - - - - - DS ARM9 BIOS: - - - - - - - - 0 - 0 - - - - Browse... - - - true - - - - - - - <html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html> - - - - - - - Browse... - - - - - - - <html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 KB: DS-mode firmware from a DSi or 3DS. Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html> - - - - - - - Browse... - - - - - - - - - - DSi mode - - - - - - Browse... - - - - - - - DSi ARM9 BIOS: - - - - - - - Browse... - - - - - - - <html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> - - - - - - - <html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html> - - - - - - - DSi ARM7 BIOS: - - - - - - - DSi firmware: - - - - - - - Browse... - - - - - - - - 0 - 0 - - - - <html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> - - - - - - - DSi NAND: - - - - - - - <html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html> - - - - - - - Browse... 
- - - - - - - - - - General - - - - - - - 0 - 0 - - - - Console type: - - - - - - - - 0 - 0 - - - - <html><head/><body><p>The type of console to emulate</p></body></html> - - - - - - - <html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html> - - - Boot game directly - - - - + + + General + + + + + + + 0 + 0 + + + + <html><head/><body><p>The type of console to emulate</p></body></html> + + + + + + + <html><head/><body><p>When loading a ROM, completely skip the regular boot process (&quot;Nintendo DS&quot; screen) to boot the ROM directly.</p><p><br/></p><p>Note: if your firmware dump isn't bootable, the ROM will be booted directly regardless of this setting.</p></body></html> + + + Boot game directly + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + + 0 + 0 + + + + Console type: + + + + + + + + BIOS Files + + + + + + DS mode + + + + + + DS firmware: + + + + + + + <html><head/><body><p>DS-mode firmware</p><p><br/></p><p>Possible firmwares:</p><p>* 128 KB: DS-mode firmware from a DSi or 3DS. Not bootable.</p><p>* 256 KB: regular DS firmware.</p><p>* 512 KB: iQue DS firmware.</p></body></html> + + + + + + + <html><head/><body><p>DS-mode ARM7 BIOS</p><p>Size should be 16 KB</p></body></html> + + + + + + + + 0 + 0 + + + + Browse... + + + true + + + + + + + Browse... + + + + + + + DS ARM7 BIOS: + + + + + + + DS ARM9 BIOS: + + + + + + + Browse... + + + + + + + + 0 + 0 + + + + + 290 + 0 + + + + + + + <html><head/><body><p>DS-mode ARM9 BIOS</p><p>Size should be 4 KB</p></body></html> + + + + + + + + + + DSi mode + + + + + + Browse... + + + + + + + DSi ARM9 BIOS: + + + + + + + Browse... 
+ + + + + + + <html><head/><body><p>DSi-mode ARM7 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> + + + + + + + <html><head/><body><p>DSi-mode firmware (used for DS-mode backwards compatibility)</p><p><br/></p><p>Size should be 128 KB</p></body></html> + + + + + + + DSi ARM7 BIOS: + + + + + + + DSi firmware: + + + + + + + Browse... + + + + + + + + 0 + 0 + + + + <html><head/><body><p>DSi-mode ARM9 BIOS</p><p><br/></p><p>Size should be 64 KB</p></body></html> + + + + + + + DSi NAND: + + + + + + + <html><head/><body><p>DSi NAND dump</p><p><br/></p><p>Should have 'nocash footer' at the end</p></body></html> + + + + + + + Browse... + + + + + + + + + + + CPU Emulation + + + + + + Enable JIT recompiler + + + + + + + Maximum JIT block size: + + + + + + + 1 + + + 32 + + + 32 + + + + + + + Branch Optimisations + + + + + + + Literal Optimisations + + + + + + + Fast Memory + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + @@ -275,6 +368,27 @@ + + tabWidget + cbxConsoleType + chkDirectBoot + txtBIOS9Path + txtBIOS7Path + txtFirmwarePath + txtDSiBIOS9Path + txtDSiBIOS7Path + txtDSiFirmwarePath + txtDSiNANDPath + btnBIOS9Browse + btnBIOS7Browse + btnFirmwareBrowse + btnDSiBIOS9Browse + btnDSiBIOS7Browse + btnDSiFirmwareBrowse + btnDSiNANDBrowse + chkEnableJIT + spnJITMaximumBlockSize + @@ -284,8 +398,8 @@ accept() - 248 - 254 + 257 + 349 157 @@ -300,8 +414,8 @@ reject() - 316 - 260 + 325 + 349 286 diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp index fa542ad..4557d0e 100644 --- a/src/frontend/qt_sdl/main.cpp +++ b/src/frontend/qt_sdl/main.cpp @@ -1641,7 +1641,14 @@ void MainWindow::onStop() void MainWindow::onOpenEmuSettings() { - EmuSettingsDialog::openDlg(this); + EmuSettingsDialog* dlg = EmuSettingsDialog::openDlg(this); + connect(dlg, &EmuSettingsDialog::finished, this, &MainWindow::onEmuSettingsDialogFinished); +} + +void MainWindow::onEmuSettingsDialogFinished(int res) +{ + if (RunningSomething) + onReset(); } void 
MainWindow::onOpenInputConfig() diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h index 279aed8..eec2a48 100644 --- a/src/frontend/qt_sdl/main.h +++ b/src/frontend/qt_sdl/main.h @@ -199,6 +199,7 @@ private slots: void onStop(); void onOpenEmuSettings(); + void onEmuSettingsDialogFinished(int res); void onOpenInputConfig(); void onInputConfigFinished(int res); void onOpenVideoSettings(); diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp deleted file mode 100644 index 0df9c6c..0000000 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - Copyright 2016-2020 Arisotura - - This file is part of melonDS. - - melonDS is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation, either version 3 of the License, or (at your option) - any later version. - - melonDS is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with melonDS. If not, see http://www.gnu.org/licenses/. 
-*/ - -#include -#include - -#include "libui/ui.h" - -#include "../types.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" - - -void ApplyNewSettings(int type); - -extern bool RunningSomething; - -namespace DlgEmuSettings -{ - -bool opened; -uiWindow* win; - -uiCheckbox* cbDirectBoot; - -#ifdef JIT_ENABLED -uiCheckbox* cbJITEnabled; -uiEntry* enJITMaxBlockSize; -uiCheckbox* cbJITBranchOptimisations; -uiCheckbox* cbJITLiteralOptimisations; -#endif - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - opened = false; - return 1; -} - -void OnCancel(uiButton* btn, void* blarg) -{ - uiControlDestroy(uiControl(win)); - opened = false; -} - -void OnOk(uiButton* btn, void* blarg) -{ -#ifdef JIT_ENABLED - bool restart = false; - - bool enableJit = uiCheckboxChecked(cbJITEnabled); - char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); - long blockSize = strtol(maxBlockSizeStr, NULL, 10); - bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); - bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); - uiFreeText(maxBlockSizeStr); - if (blockSize < 1) - blockSize = 1; - if (blockSize > 32) - blockSize = 32; - - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize - || branchOptimisations != Config::JIT_BrancheOptimisations - || literalOptimisations != Config::JIT_LiteralOptimisations) - { - if (RunningSomething && - !uiMsgBoxConfirm(win, "Reset emulator", - "Changing JIT settings requires a reset.\n\nDo you want to continue?")) - return; - - Config::JIT_Enable = enableJit; - Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = branchOptimisations; - Config::JIT_LiteralOptimisations = literalOptimisations; - - restart = true; - } -#endif - - Config::DirectBoot = uiCheckboxChecked(cbDirectBoot); - - Config::Save(); - - uiControlDestroy(uiControl(win)); - opened = false; - -#ifdef JIT_ENABLED - if (restart) - ApplyNewSettings(4); -#endif -} - -#ifdef JIT_ENABLED -void 
OnJITStateChanged(uiCheckbox* cb, void* blarg) -{ - if (uiCheckboxChecked(cb)) - { - uiControlEnable(uiControl(enJITMaxBlockSize)); - uiControlEnable(uiControl(cbJITBranchOptimisations)); - uiControlEnable(uiControl(cbJITLiteralOptimisations)); - } - else - { - uiControlDisable(uiControl(enJITMaxBlockSize)); - uiControlDisable(uiControl(cbJITBranchOptimisations)); - uiControlDisable(uiControl(cbJITLiteralOptimisations)); - } -} -#endif - -void Open() -{ - if (opened) - { - uiControlSetFocus(uiControl(win)); - return; - } - - opened = true; - win = uiNewWindow("Emu settings - melonDS", 300, 50, 0, 0, 0); - uiWindowSetMargined(win, 1); - uiWindowOnClosing(win, OnCloseWindow, NULL); - - uiBox* top = uiNewVerticalBox(); - uiWindowSetChild(win, uiControl(top)); - - { - uiBox* in_ctrl = uiNewVerticalBox(); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - cbDirectBoot = uiNewCheckbox("Boot game directly"); - uiBoxAppend(in_ctrl, uiControl(cbDirectBoot), 0); - } - -#ifdef JIT_ENABLED - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiGroup* grp = uiNewGroup("JIT"); - uiBoxAppend(top, uiControl(grp), 1); - - uiBox* in_ctrl = uiNewVerticalBox(); - uiGroupSetChild(grp, uiControl(in_ctrl)); - - cbJITEnabled = uiNewCheckbox("Enable JIT recompiler"); - uiBoxAppend(in_ctrl, uiControl(cbJITEnabled), 0); - - uiCheckboxOnToggled(cbJITEnabled, OnJITStateChanged, NULL); - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("Maximum block size (1-32): "); - uiBoxAppend(row, uiControl(lbl), 0); - - enJITMaxBlockSize = uiNewEntry(); - uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); - uiBoxAppend(row, uiControl(lbl), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - 
uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); - uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); - } - - { - uiBox* row = uiNewHorizontalBox(); - uiBoxAppend(in_ctrl, uiControl(row), 0); - - cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); - uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); - } - } -#endif - - { - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(top, uiControl(dummy), 0); - } - - { - uiBox* in_ctrl = uiNewHorizontalBox(); - uiBoxSetPadded(in_ctrl, 1); - uiBoxAppend(top, uiControl(in_ctrl), 0); - - uiLabel* dummy = uiNewLabel(""); - uiBoxAppend(in_ctrl, uiControl(dummy), 1); - - uiButton* btncancel = uiNewButton("Cancel"); - uiButtonOnClicked(btncancel, OnCancel, NULL); - uiBoxAppend(in_ctrl, uiControl(btncancel), 0); - - uiButton* btnok = uiNewButton("Ok"); - uiButtonOnClicked(btnok, OnOk, NULL); - uiBoxAppend(in_ctrl, uiControl(btnok), 0); - } - - uiCheckboxSetChecked(cbDirectBoot, Config::DirectBoot); - -#ifdef JIT_ENABLED - uiCheckboxSetChecked(cbJITEnabled, Config::JIT_Enable); - { - char maxBlockSizeStr[10]; - sprintf(maxBlockSizeStr, "%d", Config::JIT_MaxBlockSize); - uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); - } - OnJITStateChanged(cbJITEnabled, NULL); - - uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); - uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); -#endif - - uiControlShow(uiControl(win)); -} - -void Close() -{ - if (!opened) return; - uiControlDestroy(uiControl(win)); - opened = false; -} - -} diff --git a/src/libui_sdl/libui/ui.h b/src/libui_sdl/libui/ui.h deleted file mode 100644 index e45fe91..0000000 --- a/src/libui_sdl/libui/ui.h +++ /dev/null @@ -1,764 +0,0 @@ -// 6 april 2015 - -// TODO add a uiVerifyControlType() function that can be used by control implementations to verify controls - -#ifndef __LIBUI_UI_H__ -#define __LIBUI_UI_H__ - -#include 
-#include - -#ifdef __cplusplus -extern "C" { -#endif - -// this macro is generated by cmake -#ifdef libui_EXPORTS -#ifdef _WIN32 -#define _UI_EXTERN __declspec(dllexport) extern -#else -#define _UI_EXTERN __attribute__((visibility("default"))) extern -#endif -#else -// TODO add __declspec(dllimport) on windows, but only if not static -#define _UI_EXTERN extern -#endif - -// C++ is really really really really really really dumb about enums, so screw that and just make them anonymous -// This has the advantage of being ABI-able should we ever need an ABI... -#define _UI_ENUM(s) typedef unsigned int s; enum - -// This constant is provided because M_PI is nonstandard. -// This comes from Go's math.Pi, which in turn comes from http://oeis.org/A000796. -#define uiPi 3.14159265358979323846264338327950288419716939937510582097494459 - -// TODO uiBool? - -typedef struct uiInitOptions uiInitOptions; - -struct uiInitOptions { - size_t Size; -}; - -_UI_EXTERN const char *uiInit(uiInitOptions *options); -_UI_EXTERN void uiUninit(void); -_UI_EXTERN void uiFreeInitError(const char *err); - -_UI_EXTERN void uiMain(void); -_UI_EXTERN void uiMainSteps(void); -_UI_EXTERN int uiMainStep(int wait); -_UI_EXTERN void uiQuit(void); - -_UI_EXTERN void uiQueueMain(void (*f)(void *data), void *data); - -_UI_EXTERN void uiOnShouldQuit(int (*f)(void *data), void *data); - -_UI_EXTERN void uiFreeText(char *text); - -typedef struct uiControl uiControl; - -struct uiControl { - uint32_t Signature; - uint32_t OSSignature; - uint32_t TypeSignature; - void (*Destroy)(uiControl *); - uintptr_t (*Handle)(uiControl *); - uiControl *(*Parent)(uiControl *); - void (*SetParent)(uiControl *, uiControl *); - int (*Toplevel)(uiControl *); - int (*Visible)(uiControl *); - void (*Show)(uiControl *); - void (*Hide)(uiControl *); - int (*Enabled)(uiControl *); - void (*Enable)(uiControl *); - void (*Disable)(uiControl *); - void (*SetFocus)(uiControl *); - void (*SetMinSize)(uiControl*, int, int); - - int 
MinWidth, MinHeight; - - void* UserData; -}; -// TOOD add argument names to all arguments -#define uiControl(this) ((uiControl *) (this)) -_UI_EXTERN void uiControlDestroy(uiControl *); -_UI_EXTERN uintptr_t uiControlHandle(uiControl *); -_UI_EXTERN uiControl *uiControlParent(uiControl *); -_UI_EXTERN void uiControlSetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlToplevel(uiControl *); -_UI_EXTERN int uiControlVisible(uiControl *); -_UI_EXTERN void uiControlShow(uiControl *); -_UI_EXTERN void uiControlHide(uiControl *); -_UI_EXTERN int uiControlEnabled(uiControl *); -_UI_EXTERN void uiControlEnable(uiControl *); -_UI_EXTERN void uiControlDisable(uiControl *); -_UI_EXTERN void uiControlSetFocus(uiControl *); -_UI_EXTERN void uiControlSetMinSize(uiControl *, int w, int h); // -1 = no minimum - -_UI_EXTERN uiControl *uiAllocControl(size_t n, uint32_t OSsig, uint32_t typesig, const char *typenamestr); -_UI_EXTERN void uiFreeControl(uiControl *); - -// TODO make sure all controls have these -_UI_EXTERN void uiControlVerifySetParent(uiControl *, uiControl *); -_UI_EXTERN int uiControlEnabledToUser(uiControl *); - -_UI_EXTERN void uiUserBugCannotSetParentOnToplevel(const char *type); - -typedef struct uiWindow uiWindow; -#define uiWindow(this) ((uiWindow *) (this)) -_UI_EXTERN char *uiWindowTitle(uiWindow *w); -_UI_EXTERN void uiWindowSetTitle(uiWindow *w, const char *title); -_UI_EXTERN void uiWindowPosition(uiWindow *w, int *x, int *y); -_UI_EXTERN void uiWindowSetPosition(uiWindow *w, int x, int y); -_UI_EXTERN void uiWindowContentSize(uiWindow *w, int *width, int *height); -_UI_EXTERN void uiWindowSetContentSize(uiWindow *w, int width, int height); -_UI_EXTERN int uiWindowMinimized(uiWindow *w); -_UI_EXTERN void uiWindowSetMinimized(uiWindow *w, int minimized); -_UI_EXTERN int uiWindowMaximized(uiWindow *w); -_UI_EXTERN void uiWindowSetMaximized(uiWindow *w, int maximized); -_UI_EXTERN int uiWindowFullscreen(uiWindow *w); -_UI_EXTERN void 
uiWindowSetFullscreen(uiWindow *w, int fullscreen); -_UI_EXTERN int uiWindowBorderless(uiWindow *w); -_UI_EXTERN void uiWindowSetBorderless(uiWindow *w, int borderless); -_UI_EXTERN void uiWindowSetChild(uiWindow *w, uiControl *child); -_UI_EXTERN int uiWindowMargined(uiWindow *w); -_UI_EXTERN void uiWindowSetMargined(uiWindow *w, int margined); -_UI_EXTERN void uiWindowSetDropTarget(uiWindow* w, int drop); -_UI_EXTERN uiWindow *uiNewWindow(const char *title, int width, int height, int maximized, int hasMenubar, int resizable); - -_UI_EXTERN void uiWindowOnContentSizeChanged(uiWindow *w, void (*f)(uiWindow *, void *), void *data); -_UI_EXTERN void uiWindowOnClosing(uiWindow *w, int (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnDropFile(uiWindow *w, void (*f)(uiWindow *w, char *file, void *data), void *data); -_UI_EXTERN void uiWindowOnGetFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); -_UI_EXTERN void uiWindowOnLoseFocus(uiWindow *w, void (*f)(uiWindow *w, void *data), void *data); - -typedef struct uiButton uiButton; -#define uiButton(this) ((uiButton *) (this)) -_UI_EXTERN char *uiButtonText(uiButton *b); -_UI_EXTERN void uiButtonSetText(uiButton *b, const char *text); -_UI_EXTERN void uiButtonOnClicked(uiButton *b, void (*f)(uiButton *b, void *data), void *data); -_UI_EXTERN uiButton *uiNewButton(const char *text); - -typedef struct uiBox uiBox; -#define uiBox(this) ((uiBox *) (this)) -_UI_EXTERN void uiBoxAppend(uiBox *b, uiControl *child, int stretchy); -_UI_EXTERN void uiBoxDelete(uiBox *b, int index); -_UI_EXTERN int uiBoxPadded(uiBox *b); -_UI_EXTERN void uiBoxSetPadded(uiBox *b, int padded); -_UI_EXTERN uiBox *uiNewHorizontalBox(void); -_UI_EXTERN uiBox *uiNewVerticalBox(void); - -typedef struct uiCheckbox uiCheckbox; -#define uiCheckbox(this) ((uiCheckbox *) (this)) -_UI_EXTERN char *uiCheckboxText(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetText(uiCheckbox *c, const char *text); -_UI_EXTERN void 
uiCheckboxOnToggled(uiCheckbox *c, void (*f)(uiCheckbox *c, void *data), void *data); -_UI_EXTERN int uiCheckboxChecked(uiCheckbox *c); -_UI_EXTERN void uiCheckboxSetChecked(uiCheckbox *c, int checked); -_UI_EXTERN uiCheckbox *uiNewCheckbox(const char *text); - -typedef struct uiEntry uiEntry; -#define uiEntry(this) ((uiEntry *) (this)) -_UI_EXTERN char *uiEntryText(uiEntry *e); -_UI_EXTERN void uiEntrySetText(uiEntry *e, const char *text); -_UI_EXTERN void uiEntryOnChanged(uiEntry *e, void (*f)(uiEntry *e, void *data), void *data); -_UI_EXTERN int uiEntryReadOnly(uiEntry *e); -_UI_EXTERN void uiEntrySetReadOnly(uiEntry *e, int readonly); -_UI_EXTERN uiEntry *uiNewEntry(void); -_UI_EXTERN uiEntry *uiNewPasswordEntry(void); -_UI_EXTERN uiEntry *uiNewSearchEntry(void); - -typedef struct uiLabel uiLabel; -#define uiLabel(this) ((uiLabel *) (this)) -_UI_EXTERN char *uiLabelText(uiLabel *l); -_UI_EXTERN void uiLabelSetText(uiLabel *l, const char *text); -_UI_EXTERN uiLabel *uiNewLabel(const char *text); - -typedef struct uiTab uiTab; -#define uiTab(this) ((uiTab *) (this)) -_UI_EXTERN void uiTabAppend(uiTab *t, const char *name, uiControl *c); -_UI_EXTERN void uiTabInsertAt(uiTab *t, const char *name, int before, uiControl *c); -_UI_EXTERN void uiTabDelete(uiTab *t, int index); -_UI_EXTERN int uiTabNumPages(uiTab *t); -_UI_EXTERN int uiTabMargined(uiTab *t, int page); -_UI_EXTERN void uiTabSetMargined(uiTab *t, int page, int margined); -_UI_EXTERN uiTab *uiNewTab(void); - -typedef struct uiGroup uiGroup; -#define uiGroup(this) ((uiGroup *) (this)) -_UI_EXTERN char *uiGroupTitle(uiGroup *g); -_UI_EXTERN void uiGroupSetTitle(uiGroup *g, const char *title); -_UI_EXTERN void uiGroupSetChild(uiGroup *g, uiControl *c); -_UI_EXTERN int uiGroupMargined(uiGroup *g); -_UI_EXTERN void uiGroupSetMargined(uiGroup *g, int margined); -_UI_EXTERN uiGroup *uiNewGroup(const char *title); - -// spinbox/slider rules: -// setting value outside of range will automatically clamp -// initial 
value is minimum -// complaint if min >= max? - -typedef struct uiSpinbox uiSpinbox; -#define uiSpinbox(this) ((uiSpinbox *) (this)) -_UI_EXTERN int uiSpinboxValue(uiSpinbox *s); -_UI_EXTERN void uiSpinboxSetValue(uiSpinbox *s, int value); -_UI_EXTERN void uiSpinboxOnChanged(uiSpinbox *s, void (*f)(uiSpinbox *s, void *data), void *data); -_UI_EXTERN uiSpinbox *uiNewSpinbox(int min, int max); - -typedef struct uiSlider uiSlider; -#define uiSlider(this) ((uiSlider *) (this)) -_UI_EXTERN int uiSliderValue(uiSlider *s); -_UI_EXTERN void uiSliderSetValue(uiSlider *s, int value); -_UI_EXTERN void uiSliderOnChanged(uiSlider *s, void (*f)(uiSlider *s, void *data), void *data); -_UI_EXTERN uiSlider *uiNewSlider(int min, int max); - -typedef struct uiProgressBar uiProgressBar; -#define uiProgressBar(this) ((uiProgressBar *) (this)) -_UI_EXTERN int uiProgressBarValue(uiProgressBar *p); -_UI_EXTERN void uiProgressBarSetValue(uiProgressBar *p, int n); -_UI_EXTERN uiProgressBar *uiNewProgressBar(void); - -typedef struct uiSeparator uiSeparator; -#define uiSeparator(this) ((uiSeparator *) (this)) -_UI_EXTERN uiSeparator *uiNewHorizontalSeparator(void); -_UI_EXTERN uiSeparator *uiNewVerticalSeparator(void); - -typedef struct uiCombobox uiCombobox; -#define uiCombobox(this) ((uiCombobox *) (this)) -_UI_EXTERN void uiComboboxAppend(uiCombobox *c, const char *text); -_UI_EXTERN int uiComboboxSelected(uiCombobox *c); -_UI_EXTERN void uiComboboxSetSelected(uiCombobox *c, int n); -_UI_EXTERN void uiComboboxOnSelected(uiCombobox *c, void (*f)(uiCombobox *c, void *data), void *data); -_UI_EXTERN uiCombobox *uiNewCombobox(void); - -typedef struct uiEditableCombobox uiEditableCombobox; -#define uiEditableCombobox(this) ((uiEditableCombobox *) (this)) -_UI_EXTERN void uiEditableComboboxAppend(uiEditableCombobox *c, const char *text); -_UI_EXTERN char *uiEditableComboboxText(uiEditableCombobox *c); -_UI_EXTERN void uiEditableComboboxSetText(uiEditableCombobox *c, const char *text); -// TODO 
what do we call a function that sets the currently selected item and fills the text field with it? editable comboboxes have no consistent concept of selected item -_UI_EXTERN void uiEditableComboboxOnChanged(uiEditableCombobox *c, void (*f)(uiEditableCombobox *c, void *data), void *data); -_UI_EXTERN uiEditableCombobox *uiNewEditableCombobox(void); - -typedef struct uiRadioButtons uiRadioButtons; -#define uiRadioButtons(this) ((uiRadioButtons *) (this)) -_UI_EXTERN void uiRadioButtonsAppend(uiRadioButtons *r, const char *text); -_UI_EXTERN int uiRadioButtonsSelected(uiRadioButtons *r); -_UI_EXTERN void uiRadioButtonsSetSelected(uiRadioButtons *r, int n); -_UI_EXTERN void uiRadioButtonsOnSelected(uiRadioButtons *r, void (*f)(uiRadioButtons *, void *), void *data); -_UI_EXTERN uiRadioButtons *uiNewRadioButtons(void); - -typedef struct uiDateTimePicker uiDateTimePicker; -#define uiDateTimePicker(this) ((uiDateTimePicker *) (this)) -_UI_EXTERN uiDateTimePicker *uiNewDateTimePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewDatePicker(void); -_UI_EXTERN uiDateTimePicker *uiNewTimePicker(void); - -// TODO provide a facility for entering tab stops? 
-typedef struct uiMultilineEntry uiMultilineEntry; -#define uiMultilineEntry(this) ((uiMultilineEntry *) (this)) -_UI_EXTERN char *uiMultilineEntryText(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetText(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryAppend(uiMultilineEntry *e, const char *text); -_UI_EXTERN void uiMultilineEntryOnChanged(uiMultilineEntry *e, void (*f)(uiMultilineEntry *e, void *data), void *data); -_UI_EXTERN int uiMultilineEntryReadOnly(uiMultilineEntry *e); -_UI_EXTERN void uiMultilineEntrySetReadOnly(uiMultilineEntry *e, int readonly); -_UI_EXTERN uiMultilineEntry *uiNewMultilineEntry(void); -_UI_EXTERN uiMultilineEntry *uiNewNonWrappingMultilineEntry(void); - -typedef struct uiMenuItem uiMenuItem; -#define uiMenuItem(this) ((uiMenuItem *) (this)) -_UI_EXTERN void uiMenuItemEnable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemDisable(uiMenuItem *m); -_UI_EXTERN void uiMenuItemOnClicked(uiMenuItem *m, void (*f)(uiMenuItem *sender, uiWindow *window, void *data), void *data); -_UI_EXTERN int uiMenuItemChecked(uiMenuItem *m); -_UI_EXTERN void uiMenuItemSetChecked(uiMenuItem *m, int checked); - -typedef struct uiMenu uiMenu; -#define uiMenu(this) ((uiMenu *) (this)) -_UI_EXTERN uiMenuItem *uiMenuAppendItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendCheckItem(uiMenu *m, const char *name); -_UI_EXTERN uiMenuItem *uiMenuAppendQuitItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendPreferencesItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendAboutItem(uiMenu *m); -_UI_EXTERN uiMenuItem *uiMenuAppendSubmenu(uiMenu *m, uiMenu* child); -_UI_EXTERN void uiMenuAppendSeparator(uiMenu *m); -_UI_EXTERN uiMenu *uiNewMenu(const char *name); - -_UI_EXTERN char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath); -_UI_EXTERN void uiMsgBox(uiWindow *parent, const char *title, const char 
*description); -_UI_EXTERN void uiMsgBoxError(uiWindow *parent, const char *title, const char *description); -_UI_EXTERN int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description); - -typedef struct uiArea uiArea; -typedef struct uiAreaHandler uiAreaHandler; -typedef struct uiAreaDrawParams uiAreaDrawParams; -typedef struct uiAreaMouseEvent uiAreaMouseEvent; -typedef struct uiAreaKeyEvent uiAreaKeyEvent; - -typedef struct uiDrawContext uiDrawContext; - -// TO CONSIDER: the uiAreaHandler param there seems useless -// (might use individual callbacks instead of handler struct?) -struct uiAreaHandler { - void (*Draw)(uiAreaHandler *, uiArea *, uiAreaDrawParams *); - // TODO document that resizes cause a full redraw for non-scrolling areas; implementation-defined for scrolling areas - void (*MouseEvent)(uiAreaHandler *, uiArea *, uiAreaMouseEvent *); - // TODO document that on first show if the mouse is already in the uiArea then one gets sent with left=0 - // TODO what about when the area is hidden and then shown again? - void (*MouseCrossed)(uiAreaHandler *, uiArea *, int left); - void (*DragBroken)(uiAreaHandler *, uiArea *); - int (*KeyEvent)(uiAreaHandler *, uiArea *, uiAreaKeyEvent *); - void (*Resize)(uiAreaHandler *, uiArea *, int, int); -}; - -// TODO RTL layouts? -// TODO reconcile edge and corner naming -_UI_ENUM(uiWindowResizeEdge) { - uiWindowResizeEdgeLeft, - uiWindowResizeEdgeTop, - uiWindowResizeEdgeRight, - uiWindowResizeEdgeBottom, - uiWindowResizeEdgeTopLeft, - uiWindowResizeEdgeTopRight, - uiWindowResizeEdgeBottomLeft, - uiWindowResizeEdgeBottomRight, - // TODO have one for keyboard resizes? - // TODO GDK doesn't seem to have any others, including for keyboards... - // TODO way to bring up the system menu instead? 
-}; - -#define uiGLVersion(major, minor) ((major) | ((minor)<<16)) -#define uiGLVerMajor(ver) ((ver) & 0xFFFF) -#define uiGLVerMinor(ver) ((ver) >> 16) - -#define uiArea(this) ((uiArea *) (this)) -// TODO give a better name -// TODO document the types of width and height -_UI_EXTERN void uiAreaSetSize(uiArea *a, int width, int height); -// TODO uiAreaQueueRedraw() -_UI_EXTERN void uiAreaQueueRedrawAll(uiArea *a); -_UI_EXTERN void uiAreaScrollTo(uiArea *a, double x, double y, double width, double height); -// TODO document these can only be called within Mouse() handlers -// TODO should these be allowed on scrolling areas? -// TODO decide which mouse events should be accepted; Down is the only one guaranteed to work right now -// TODO what happens to events after calling this up to and including the next mouse up? -// TODO release capture? -_UI_EXTERN void uiAreaBeginUserWindowMove(uiArea *a); -_UI_EXTERN void uiAreaBeginUserWindowResize(uiArea *a, uiWindowResizeEdge edge); -_UI_EXTERN void uiAreaSetBackgroundColor(uiArea *a, int r, int g, int b); -_UI_EXTERN uiArea *uiNewArea(uiAreaHandler *ah); -_UI_EXTERN uiArea *uiNewGLArea(uiAreaHandler *ah, const unsigned int* req_versions); -_UI_EXTERN uiArea *uiNewScrollingArea(uiAreaHandler *ah, int width, int height); - -struct uiAreaDrawParams { - uiDrawContext *Context; - - // TODO document that this is only defined for nonscrolling areas - double AreaWidth; - double AreaHeight; - - double ClipX; - double ClipY; - double ClipWidth; - double ClipHeight; -}; - -typedef struct uiDrawPath uiDrawPath; -typedef struct uiDrawBrush uiDrawBrush; -typedef struct uiDrawStrokeParams uiDrawStrokeParams; -typedef struct uiDrawMatrix uiDrawMatrix; - -typedef struct uiDrawBrushGradientStop uiDrawBrushGradientStop; - -typedef struct uiDrawBitmap uiDrawBitmap; - -_UI_ENUM(uiDrawBrushType) { - uiDrawBrushTypeSolid, - uiDrawBrushTypeLinearGradient, - uiDrawBrushTypeRadialGradient, - uiDrawBrushTypeImage, -}; - -_UI_ENUM(uiDrawLineCap) { - 
uiDrawLineCapFlat, - uiDrawLineCapRound, - uiDrawLineCapSquare, -}; - -_UI_ENUM(uiDrawLineJoin) { - uiDrawLineJoinMiter, - uiDrawLineJoinRound, - uiDrawLineJoinBevel, -}; - -// this is the default for botoh cairo and Direct2D (in the latter case, from the C++ helper functions) -// Core Graphics doesn't explicitly specify a default, but NSBezierPath allows you to choose one, and this is the initial value -// so we're good to use it too! -#define uiDrawDefaultMiterLimit 10.0 - -_UI_ENUM(uiDrawFillMode) { - uiDrawFillModeWinding, - uiDrawFillModeAlternate, -}; - -struct uiDrawMatrix { - double M11; - double M12; - double M21; - double M22; - double M31; - double M32; -}; - -struct uiDrawBrush { - uiDrawBrushType Type; - - // solid brushes - double R; - double G; - double B; - double A; - - // gradient brushes - double X0; // linear: start X, radial: start X - double Y0; // linear: start Y, radial: start Y - double X1; // linear: end X, radial: outer circle center X - double Y1; // linear: end Y, radial: outer circle center Y - double OuterRadius; // radial gradients only - uiDrawBrushGradientStop *Stops; - size_t NumStops; - // TODO extend mode - // cairo: none, repeat, reflect, pad; no individual control - // Direct2D: repeat, reflect, pad; no individual control - // Core Graphics: none, pad; before and after individually - // TODO cairo documentation is inconsistent about pad - - // TODO images - - // TODO transforms -}; - -struct uiDrawBrushGradientStop { - double Pos; - double R; - double G; - double B; - double A; -}; - -struct uiDrawStrokeParams { - uiDrawLineCap Cap; - uiDrawLineJoin Join; - // TODO what if this is 0? on windows there will be a crash with dashing - double Thickness; - double MiterLimit; - double *Dashes; - // TOOD what if this is 1 on Direct2D? - // TODO what if a dash is 0 on Cairo or Quartz? 
- size_t NumDashes; - double DashPhase; -}; - -struct uiRect { - int X; - int Y; - int Width; - int Height; -}; - -typedef struct uiRect uiRect; - -_UI_EXTERN uiDrawPath *uiDrawNewPath(uiDrawFillMode fillMode); -_UI_EXTERN void uiDrawFreePath(uiDrawPath *p); - -_UI_EXTERN void uiDrawPathNewFigure(uiDrawPath *p, double x, double y); -_UI_EXTERN void uiDrawPathNewFigureWithArc(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathLineTo(uiDrawPath *p, double x, double y); -// notes: angles are both relative to 0 and go counterclockwise -// TODO is the initial line segment on cairo and OS X a proper join? -// TODO what if sweep < 0? -_UI_EXTERN void uiDrawPathArcTo(uiDrawPath *p, double xCenter, double yCenter, double radius, double startAngle, double sweep, int negative); -_UI_EXTERN void uiDrawPathBezierTo(uiDrawPath *p, double c1x, double c1y, double c2x, double c2y, double endX, double endY); -// TODO quadratic bezier -_UI_EXTERN void uiDrawPathCloseFigure(uiDrawPath *p); - -// TODO effect of these when a figure is already started -_UI_EXTERN void uiDrawPathAddRectangle(uiDrawPath *p, double x, double y, double width, double height); - -_UI_EXTERN void uiDrawPathEnd(uiDrawPath *p); - -_UI_EXTERN void uiDrawStroke(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b, uiDrawStrokeParams *p); -_UI_EXTERN void uiDrawFill(uiDrawContext *c, uiDrawPath *path, uiDrawBrush *b); - -// TODO primitives: -// - rounded rectangles -// - elliptical arcs -// - quadratic bezier curves - -_UI_EXTERN void uiDrawMatrixSetIdentity(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTranslate(uiDrawMatrix *m, double x, double y); -_UI_EXTERN void uiDrawMatrixScale(uiDrawMatrix *m, double xCenter, double yCenter, double x, double y); -_UI_EXTERN void uiDrawMatrixRotate(uiDrawMatrix *m, double x, double y, double amount); -_UI_EXTERN void uiDrawMatrixSkew(uiDrawMatrix *m, double x, double y, double xamount, double 
yamount); -_UI_EXTERN void uiDrawMatrixMultiply(uiDrawMatrix *dest, uiDrawMatrix *src); -_UI_EXTERN int uiDrawMatrixInvertible(uiDrawMatrix *m); -_UI_EXTERN int uiDrawMatrixInvert(uiDrawMatrix *m); -_UI_EXTERN void uiDrawMatrixTransformPoint(uiDrawMatrix *m, double *x, double *y); -_UI_EXTERN void uiDrawMatrixTransformSize(uiDrawMatrix *m, double *x, double *y); - -_UI_EXTERN void uiDrawTransform(uiDrawContext *c, uiDrawMatrix *m); - -// TODO add a uiDrawPathStrokeToFill() or something like that -_UI_EXTERN void uiDrawClip(uiDrawContext *c, uiDrawPath *path); - -_UI_EXTERN void uiDrawSave(uiDrawContext *c); -_UI_EXTERN void uiDrawRestore(uiDrawContext *c); - -// bitmap API -_UI_EXTERN uiDrawBitmap* uiDrawNewBitmap(uiDrawContext* c, int width, int height, int alpha); -_UI_EXTERN void uiDrawBitmapUpdate(uiDrawBitmap* bmp, const void* data); -_UI_EXTERN void uiDrawBitmapDraw(uiDrawContext* c, uiDrawBitmap* bmp, uiRect* srcrect, uiRect* dstrect, int filter); -_UI_EXTERN void uiDrawFreeBitmap(uiDrawBitmap* bmp); - -// TODO manage the use of Text, Font, and TextFont, and of the uiDrawText prefix in general - -///// TODO reconsider this -typedef struct uiDrawFontFamilies uiDrawFontFamilies; - -_UI_EXTERN uiDrawFontFamilies *uiDrawListFontFamilies(void); -_UI_EXTERN int uiDrawFontFamiliesNumFamilies(uiDrawFontFamilies *ff); -_UI_EXTERN char *uiDrawFontFamiliesFamily(uiDrawFontFamilies *ff, int n); -_UI_EXTERN void uiDrawFreeFontFamilies(uiDrawFontFamilies *ff); -///// END TODO - -typedef struct uiDrawTextLayout uiDrawTextLayout; -typedef struct uiDrawTextFont uiDrawTextFont; -typedef struct uiDrawTextFontDescriptor uiDrawTextFontDescriptor; -typedef struct uiDrawTextFontMetrics uiDrawTextFontMetrics; - -_UI_ENUM(uiDrawTextWeight) { - uiDrawTextWeightThin, - uiDrawTextWeightUltraLight, - uiDrawTextWeightLight, - uiDrawTextWeightBook, - uiDrawTextWeightNormal, - uiDrawTextWeightMedium, - uiDrawTextWeightSemiBold, - uiDrawTextWeightBold, - uiDrawTextWeightUltraBold, - 
uiDrawTextWeightHeavy, - uiDrawTextWeightUltraHeavy, -}; - -_UI_ENUM(uiDrawTextItalic) { - uiDrawTextItalicNormal, - uiDrawTextItalicOblique, - uiDrawTextItalicItalic, -}; - -_UI_ENUM(uiDrawTextStretch) { - uiDrawTextStretchUltraCondensed, - uiDrawTextStretchExtraCondensed, - uiDrawTextStretchCondensed, - uiDrawTextStretchSemiCondensed, - uiDrawTextStretchNormal, - uiDrawTextStretchSemiExpanded, - uiDrawTextStretchExpanded, - uiDrawTextStretchExtraExpanded, - uiDrawTextStretchUltraExpanded, -}; - -struct uiDrawTextFontDescriptor { - const char *Family; - double Size; - uiDrawTextWeight Weight; - uiDrawTextItalic Italic; - uiDrawTextStretch Stretch; -}; - -struct uiDrawTextFontMetrics { - double Ascent; - double Descent; - double Leading; - // TODO do these two mean the same across all platforms? - double UnderlinePos; - double UnderlineThickness; -}; - -_UI_EXTERN uiDrawTextFont *uiDrawLoadClosestFont(const uiDrawTextFontDescriptor *desc); -_UI_EXTERN void uiDrawFreeTextFont(uiDrawTextFont *font); -_UI_EXTERN uintptr_t uiDrawTextFontHandle(uiDrawTextFont *font); -_UI_EXTERN void uiDrawTextFontDescribe(uiDrawTextFont *font, uiDrawTextFontDescriptor *desc); -// TODO make copy with given attributes methods? -// TODO yuck this name -_UI_EXTERN void uiDrawTextFontGetMetrics(uiDrawTextFont *font, uiDrawTextFontMetrics *metrics); - -// TODO initial line spacing? and what about leading? 
-_UI_EXTERN uiDrawTextLayout *uiDrawNewTextLayout(const char *text, uiDrawTextFont *defaultFont, double width); -_UI_EXTERN void uiDrawFreeTextLayout(uiDrawTextLayout *layout); -// TODO get width -_UI_EXTERN void uiDrawTextLayoutSetWidth(uiDrawTextLayout *layout, double width); -_UI_EXTERN void uiDrawTextLayoutExtents(uiDrawTextLayout *layout, double *width, double *height); - -// and the attributes that you can set on a text layout -_UI_EXTERN void uiDrawTextLayoutSetColor(uiDrawTextLayout *layout, int startChar, int endChar, double r, double g, double b, double a); - -_UI_EXTERN void uiDrawText(uiDrawContext *c, double x, double y, uiDrawTextLayout *layout); - - -// OpenGL support - -typedef struct uiGLContext uiGLContext; - -_UI_EXTERN uiGLContext *uiAreaGetGLContext(uiArea* a); -_UI_EXTERN void uiGLMakeContextCurrent(uiGLContext* ctx); -_UI_EXTERN void uiGLBegin(uiGLContext* ctx); -_UI_EXTERN void uiGLEnd(uiGLContext* ctx); -_UI_EXTERN unsigned int uiGLGetVersion(uiGLContext* ctx); -_UI_EXTERN void *uiGLGetProcAddress(const char* proc); -_UI_EXTERN int uiGLGetFramebuffer(uiGLContext* ctx); -_UI_EXTERN float uiGLGetFramebufferScale(uiGLContext* ctx); -_UI_EXTERN void uiGLSwapBuffers(uiGLContext* ctx); -_UI_EXTERN void uiGLSetVSync(int sync); - - -_UI_ENUM(uiModifiers) { - uiModifierCtrl = 1 << 0, - uiModifierAlt = 1 << 1, - uiModifierShift = 1 << 2, - uiModifierSuper = 1 << 3, -}; - -// TODO document drag captures -struct uiAreaMouseEvent { - // TODO document what these mean for scrolling areas - double X; - double Y; - - // TODO see draw above - double AreaWidth; - double AreaHeight; - - int Down; - int Up; - - int Count; - - uiModifiers Modifiers; - - uint64_t Held1To64; -}; - -_UI_ENUM(uiExtKey) { - uiExtKeyEscape = 1, - uiExtKeyInsert, // equivalent to "Help" on Apple keyboards - uiExtKeyDelete, - uiExtKeyHome, - uiExtKeyEnd, - uiExtKeyPageUp, - uiExtKeyPageDown, - uiExtKeyUp, - uiExtKeyDown, - uiExtKeyLeft, - uiExtKeyRight, - uiExtKeyF1, // F1..F12 are 
guaranteed to be consecutive - uiExtKeyF2, - uiExtKeyF3, - uiExtKeyF4, - uiExtKeyF5, - uiExtKeyF6, - uiExtKeyF7, - uiExtKeyF8, - uiExtKeyF9, - uiExtKeyF10, - uiExtKeyF11, - uiExtKeyF12, - uiExtKeyN0, // numpad keys; independent of Num Lock state - uiExtKeyN1, // N0..N9 are guaranteed to be consecutive - uiExtKeyN2, - uiExtKeyN3, - uiExtKeyN4, - uiExtKeyN5, - uiExtKeyN6, - uiExtKeyN7, - uiExtKeyN8, - uiExtKeyN9, - uiExtKeyNDot, - uiExtKeyNEnter, - uiExtKeyNAdd, - uiExtKeyNSubtract, - uiExtKeyNMultiply, - uiExtKeyNDivide, -}; - -struct uiAreaKeyEvent { - char Key; - uiExtKey ExtKey; - uiModifiers Modifier; - - uiModifiers Modifiers; - - // additional things - int Scancode; // bit0-7: scancode, bit8: ext flag - - int Up; - int Repeat; -}; - -typedef struct uiFontButton uiFontButton; -#define uiFontButton(this) ((uiFontButton *) (this)) -// TODO document this returns a new font -_UI_EXTERN uiDrawTextFont *uiFontButtonFont(uiFontButton *b); -// TOOD SetFont, mechanics -_UI_EXTERN void uiFontButtonOnChanged(uiFontButton *b, void (*f)(uiFontButton *, void *), void *data); -_UI_EXTERN uiFontButton *uiNewFontButton(void); - -typedef struct uiColorButton uiColorButton; -#define uiColorButton(this) ((uiColorButton *) (this)) -_UI_EXTERN void uiColorButtonColor(uiColorButton *b, double *r, double *g, double *bl, double *a); -_UI_EXTERN void uiColorButtonSetColor(uiColorButton *b, double r, double g, double bl, double a); -_UI_EXTERN void uiColorButtonOnChanged(uiColorButton *b, void (*f)(uiColorButton *, void *), void *data); -_UI_EXTERN uiColorButton *uiNewColorButton(void); - -typedef struct uiForm uiForm; -#define uiForm(this) ((uiForm *) (this)) -_UI_EXTERN void uiFormAppend(uiForm *f, const char *label, uiControl *c, int stretchy); -_UI_EXTERN void uiFormDelete(uiForm *f, int index); -_UI_EXTERN int uiFormPadded(uiForm *f); -_UI_EXTERN void uiFormSetPadded(uiForm *f, int padded); -_UI_EXTERN uiForm *uiNewForm(void); - -_UI_ENUM(uiAlign) { - uiAlignFill, - uiAlignStart, - 
uiAlignCenter, - uiAlignEnd, -}; - -_UI_ENUM(uiAt) { - uiAtLeading, - uiAtTop, - uiAtTrailing, - uiAtBottom, -}; - -typedef struct uiGrid uiGrid; -#define uiGrid(this) ((uiGrid *) (this)) -_UI_EXTERN void uiGridAppend(uiGrid *g, uiControl *c, int left, int top, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN void uiGridInsertAt(uiGrid *g, uiControl *c, uiControl *existing, uiAt at, int xspan, int yspan, int hexpand, uiAlign halign, int vexpand, uiAlign valign); -_UI_EXTERN int uiGridPadded(uiGrid *g); -_UI_EXTERN void uiGridSetPadded(uiGrid *g, int padded); -_UI_EXTERN uiGrid *uiNewGrid(void); - - -// misc. - -_UI_EXTERN char* uiKeyName(int scancode); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/libui_sdl/libui/unix/stddialogs.c b/src/libui_sdl/libui/unix/stddialogs.c deleted file mode 100644 index 10c598d..0000000 --- a/src/libui_sdl/libui/unix/stddialogs.c +++ /dev/null @@ -1,126 +0,0 @@ -// 26 june 2015 -#include "uipriv_unix.h" - -// LONGTERM figure out why, and describe, that this is the desired behavior -// LONGTERM also point out that font and color buttons also work like this - -#define windowWindow(w) ((w)?(GTK_WINDOW(uiControlHandle(uiControl(w)))):NULL) - -static char *filedialog(GtkWindow *parent, GtkFileChooserAction mode, const gchar *confirm, const char* filter, const char* initpath) -{ - GtkWidget *fcd; - GtkFileChooser *fc; - gint response; - char *filename; - - fcd = gtk_file_chooser_dialog_new(NULL, parent, mode, - "_Cancel", GTK_RESPONSE_CANCEL, - confirm, GTK_RESPONSE_ACCEPT, - NULL); - fc = GTK_FILE_CHOOSER(fcd); - - // filters - { - gchar _filter[256]; - gchar* fp = &_filter[0]; int s = 0; - gchar* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - GtkFileFilter* filter = gtk_file_filter_new(); - gtk_file_filter_set_name(filter, fname); - - for (gchar* j = fp; ; j++) - { - if (*j == ';') - { - *j = '\0'; 
- gtk_file_filter_add_pattern(filter, fp); - fp = j+1; - } - else if (*j == '\0') - { - gtk_file_filter_add_pattern(filter, fp); - break; - } - } - - gtk_file_chooser_add_filter(fc, filter); - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - } - - gtk_file_chooser_set_local_only(fc, FALSE); - gtk_file_chooser_set_select_multiple(fc, FALSE); - gtk_file_chooser_set_show_hidden(fc, TRUE); - gtk_file_chooser_set_do_overwrite_confirmation(fc, TRUE); - gtk_file_chooser_set_create_folders(fc, TRUE); - if (initpath && strlen(initpath)>0) - gtk_file_chooser_set_current_folder(fc, initpath); - - response = gtk_dialog_run(GTK_DIALOG(fcd)); - if (response != GTK_RESPONSE_ACCEPT) { - gtk_widget_destroy(fcd); - return NULL; - } - filename = uiUnixStrdupText(gtk_file_chooser_get_filename(fc)); - gtk_widget_destroy(fcd); - return filename; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_OPEN, "_Open", filter, initpath); -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - return filedialog(windowWindow(parent), GTK_FILE_CHOOSER_ACTION_SAVE, "_Save", filter, initpath); -} - -static int msgbox(GtkWindow *parent, const char *title, const char *description, GtkMessageType type, GtkButtonsType buttons) -{ - GtkWidget *md; - - md = gtk_message_dialog_new(parent, GTK_DIALOG_MODAL, - type, buttons, - "%s", title); - gtk_message_dialog_format_secondary_text(GTK_MESSAGE_DIALOG(md), "%s", description); - int result = gtk_dialog_run(GTK_DIALOG(md)); - gtk_widget_destroy(md); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_OTHER, GTK_BUTTONS_OK); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ 
- msgbox(windowWindow(parent), title, description, GTK_MESSAGE_ERROR, GTK_BUTTONS_OK); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - int result = - msgbox(windowWindow(parent), title, description, GTK_MESSAGE_QUESTION, GTK_BUTTONS_OK_CANCEL); - - return result == GTK_RESPONSE_OK; -} \ No newline at end of file diff --git a/src/libui_sdl/libui/windows/stddialogs.cpp b/src/libui_sdl/libui/windows/stddialogs.cpp deleted file mode 100644 index 7537015..0000000 --- a/src/libui_sdl/libui/windows/stddialogs.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// 22 may 2015 -#include "uipriv_windows.hpp" - -// TODO document all this is what we want -// TODO do the same for font and color buttons - -// notes: -// - FOS_SUPPORTSTREAMABLEITEMS doesn't seem to be supported on windows vista, or at least not with the flags we use -// - even with FOS_NOVALIDATE the dialogs will reject invalid filenames (at least on Vista, anyway) -// - lack of FOS_NOREADONLYRETURN doesn't seem to matter on Windows 7 - -// TODO -// - http://blogs.msdn.com/b/wpfsdk/archive/2006/10/26/uncommon-dialogs--font-chooser-and-color-picker-dialogs.aspx -// - when a dialog is active, tab navigation in other windows stops working -// - when adding uiOpenFolder(), use IFileDialog as well - https://msdn.microsoft.com/en-us/library/windows/desktop/bb762115%28v=vs.85%29.aspx - -#define windowHWND(w) (w ? 
(HWND) uiControlHandle(uiControl(w)) : NULL) - -char *commonItemDialog(HWND parent, REFCLSID clsid, REFIID iid, const char* filter, const char* initpath, FILEOPENDIALOGOPTIONS optsadd) -{ - IFileDialog *d = NULL; - FILEOPENDIALOGOPTIONS opts; - IShellItem *result = NULL; - WCHAR *wname = NULL; - char *name = NULL; - HRESULT hr; - - hr = CoCreateInstance(clsid, - NULL, CLSCTX_INPROC_SERVER, - iid, (LPVOID *) (&d)); - if (hr != S_OK) { - logHRESULT(L"error creating common item dialog", hr); - // always return NULL on error - goto out; - } - hr = d->GetOptions(&opts); - if (hr != S_OK) { - logHRESULT(L"error getting current options", hr); - goto out; - } - opts |= optsadd; - // the other platforms don't check read-only; we won't either - opts &= ~FOS_NOREADONLYRETURN; - hr = d->SetOptions(opts); - if (hr != S_OK) { - logHRESULT(L"error setting options", hr); - goto out; - } - - // filters - { - COMDLG_FILTERSPEC filterspec[8]; - wchar_t _filter[256]; - wchar_t* fp = &_filter[0]; int s = 0; - wchar_t* fname; - for (int i = 0; i < 255; i++) - { - if (filter[i] == '|' || filter[i] == '\0') - { - _filter[i] = '\0'; - if (s & 1) - { - filterspec[s>>1].pszName = fname; - filterspec[s>>1].pszSpec = fp; - } - else - { - fname = fp; - } - fp = &_filter[i+1]; - s++; - if (s >= 8) break; - if (filter[i] == '\0') break; - } - else - _filter[i] = filter[i]; - } - d->SetFileTypes(s>>1, filterspec); - } - - hr = d->Show(parent); - if (hr == HRESULT_FROM_WIN32(ERROR_CANCELLED)) - // cancelled; return NULL like we have ready - goto out; - if (hr != S_OK) { - logHRESULT(L"error showing dialog", hr); - goto out; - } - hr = d->GetResult(&result); - if (hr != S_OK) { - logHRESULT(L"error getting dialog result", hr); - goto out; - } - hr = result->GetDisplayName(SIGDN_FILESYSPATH, &wname); - if (hr != S_OK) { - logHRESULT(L"error getting filename", hr); - goto out; - } - name = toUTF8(wname); - -out: - if (wname != NULL) - CoTaskMemFree(wname); - if (result != NULL) - result->Release(); - 
if (d != NULL) - d->Release(); - return name; -} - -char *uiOpenFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileOpenDialog, IID_IFileOpenDialog, - filter, initpath, - FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | FOS_NOVALIDATE | FOS_PATHMUSTEXIST | FOS_FILEMUSTEXIST | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -char *uiSaveFile(uiWindow *parent, const char* filter, const char* initpath) -{ - char *res; - - disableAllWindowsExcept(parent); - res = commonItemDialog(windowHWND(parent), - CLSID_FileSaveDialog, IID_IFileSaveDialog, - filter, initpath, - FOS_OVERWRITEPROMPT | FOS_NOCHANGEDIR | FOS_FORCEFILESYSTEM | FOS_NOVALIDATE | FOS_SHAREAWARE | FOS_NOTESTFILECREATE | FOS_FORCESHOWHIDDEN | FOS_DEFAULTNOMINIMODE); - enableAllWindowsExcept(parent); - return res; -} - -// TODO switch to TaskDialogIndirect()? 
- -static int msgbox(HWND parent, const char *title, const char *description, TASKDIALOG_COMMON_BUTTON_FLAGS buttons, PCWSTR icon) -{ - WCHAR *wtitle, *wdescription; - HRESULT hr; - - wtitle = toUTF16(title); - wdescription = toUTF16(description); - - int result; - hr = TaskDialog(parent, NULL, NULL, wtitle, wdescription, buttons, icon, &result); - if (hr != S_OK) - logHRESULT(L"error showing task dialog", hr); - - uiFree(wdescription); - uiFree(wtitle); - - return result; -} - -void uiMsgBox(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, NULL); - enableAllWindowsExcept(parent); -} - -void uiMsgBoxError(uiWindow *parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON, TD_ERROR_ICON); - enableAllWindowsExcept(parent); -} - -int uiMsgBoxConfirm(uiWindow * parent, const char *title, const char *description) -{ - disableAllWindowsExcept(parent); - int result = - msgbox(windowHWND(parent), title, description, TDCBF_OK_BUTTON | TDCBF_CANCEL_BUTTON, TD_WARNING_ICON); - enableAllWindowsExcept(parent); - - return result == IDOK; -} \ No newline at end of file diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp deleted file mode 100644 index 0066668..0000000 --- a/src/libui_sdl/main.cpp +++ /dev/null @@ -1,3061 +0,0 @@ -/* - Copyright 2016-2020 Arisotura - - This file is part of melonDS. - - melonDS is free software: you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation, either version 3 of the License, or (at your option) - any later version. - - melonDS is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with melonDS. If not, see http://www.gnu.org/licenses/. -*/ - -#include -#include -#include -#include - -#ifndef __WIN32__ -#include -#endif - -#include -#include "libui/ui.h" - -#include "../OpenGLSupport.h" -#include "main_shaders.h" - -#include "../types.h" -#include "../version.h" -#include "PlatformConfig.h" - -#include "DlgEmuSettings.h" -#include "DlgInputConfig.h" -#include "DlgVideoSettings.h" -#include "DlgAudioSettings.h" -#include "DlgWifiSettings.h" - -#include "../NDS.h" -#include "../GBACart.h" -#include "../GPU.h" -#include "../SPU.h" -#include "../Wifi.h" -#include "../Platform.h" -#include "../Config.h" -#include "../ARMJIT.h" - -#include "../Savestate.h" - -#include "OSD.h" - -#ifdef MELONCAP -#include "MelonCap.h" -#endif // MELONCAP - - -// savestate slot mapping -// 1-8: regular slots (quick access) -// '9': load/save arbitrary file -const int kSavestateNum[9] = {1, 2, 3, 4, 5, 6, 7, 8, 0}; - -const int kScreenSize[4] = {1, 2, 3, 4}; -const int kScreenRot[4] = {0, 1, 2, 3}; -const int kScreenGap[6] = {0, 1, 8, 64, 90, 128}; -const int kScreenLayout[3] = {0, 1, 2}; -const int kScreenSizing[4] = {0, 1, 2, 3}; - - -char* EmuDirectory; - - -uiWindow* MainWindow; -uiArea* MainDrawArea; -uiAreaHandler MainDrawAreaHandler; - -const u32 kGLVersions[] = {uiGLVersion(3,2), uiGLVersion(3,1), 0}; -uiGLContext* GLContext; - -int WindowWidth, WindowHeight; - -uiMenuItem* MenuItem_SaveState; -uiMenuItem* MenuItem_LoadState; -uiMenuItem* MenuItem_UndoStateLoad; - -uiMenuItem* MenuItem_SaveStateSlot[9]; -uiMenuItem* MenuItem_LoadStateSlot[9]; - -uiMenuItem* MenuItem_Pause; -uiMenuItem* MenuItem_Reset; -uiMenuItem* MenuItem_Stop; - -uiMenuItem* MenuItem_SavestateSRAMReloc; - -uiMenuItem* MenuItem_ScreenRot[4]; -uiMenuItem* MenuItem_ScreenGap[6]; -uiMenuItem* MenuItem_ScreenLayout[3]; -uiMenuItem* MenuItem_ScreenSizing[4]; - 
-uiMenuItem* MenuItem_ScreenFilter; -uiMenuItem* MenuItem_LimitFPS; -uiMenuItem* MenuItem_AudioSync; -uiMenuItem* MenuItem_ShowOSD; - -SDL_Thread* EmuThread; -int EmuRunning; -volatile int EmuStatus; - -bool RunningSomething; -char ROMPath[2][1024]; -char SRAMPath[2][1024]; -char PrevSRAMPath[2][1024]; // for savestate 'undo load' - -bool SavestateLoaded; - -bool Screen_UseGL; - -bool ScreenDrawInited = false; -uiDrawBitmap* ScreenBitmap[2] = {NULL,NULL}; - -GLuint GL_ScreenShader[3]; -GLuint GL_ScreenShaderAccel[3]; -GLuint GL_ScreenShaderOSD[3]; -struct -{ - float uScreenSize[2]; - u32 u3DScale; - u32 uFilterMode; - -} GL_ShaderConfig; -GLuint GL_ShaderConfigUBO; -GLuint GL_ScreenVertexArrayID, GL_ScreenVertexBufferID; -float GL_ScreenVertices[2 * 3*2 * 4]; // position/texcoord -GLuint GL_ScreenTexture; -bool GL_ScreenSizeDirty; - -int GL_3DScale; - -bool GL_VSyncStatus; - -int ScreenGap = 0; -int ScreenLayout = 0; -int ScreenSizing = 0; -int ScreenRotation = 0; - -int MainScreenPos[3]; -int AutoScreenSizing; - -uiRect TopScreenRect; -uiRect BottomScreenRect; -uiDrawMatrix TopScreenTrans; -uiDrawMatrix BottomScreenTrans; - -bool Touching = false; - -u32 KeyInputMask, JoyInputMask; -u32 KeyHotkeyMask, JoyHotkeyMask; -u32 HotkeyMask, LastHotkeyMask; -u32 HotkeyPress, HotkeyRelease; - -#define HotkeyDown(hk) (HotkeyMask & (1<<(hk))) -#define HotkeyPressed(hk) (HotkeyPress & (1<<(hk))) -#define HotkeyReleased(hk) (HotkeyRelease & (1<<(hk))) - -bool LidStatus; - -int JoystickID; -SDL_Joystick* Joystick; - -int AudioFreq; -float AudioSampleFrac; -SDL_AudioDeviceID AudioDevice, MicDevice; - -SDL_cond* AudioSync; -SDL_mutex* AudioSyncLock; - -u32 MicBufferLength = 2048; -s16 MicBuffer[2048]; -u32 MicBufferReadPos, MicBufferWritePos; - -u32 MicWavLength; -s16* MicWavBuffer; - -void SetupScreenRects(int width, int height); - -void TogglePause(void* blarg); -void Reset(void* blarg); - -void SetupSRAMPath(int slot); - -void SaveState(int slot); -void LoadState(int slot); 
-void UndoStateLoad(); -void GetSavestateName(int slot, char* filename, int len); - -void CreateMainWindow(bool opengl); -void DestroyMainWindow(); -void RecreateMainWindow(bool opengl); - - - -bool GLScreen_InitShader(GLuint* shader, const char* fs) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS, fs, shader, "ScreenShader")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindAttribLocation(shader[2], 1, "vTexcoord"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "ScreenTex"); - glUniform1i(uni_id, 0); - uni_id = glGetUniformLocation(shader[2], "_3DTex"); - glUniform1i(uni_id, 1); - - return true; -} - -bool GLScreen_InitOSDShader(GLuint* shader) -{ - if (!OpenGL_BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, shader, "ScreenShaderOSD")) - return false; - - glBindAttribLocation(shader[2], 0, "vPosition"); - glBindFragDataLocation(shader[2], 0, "oColor"); - - if (!OpenGL_LinkShaderProgram(shader)) - return false; - - GLuint uni_id; - - uni_id = glGetUniformBlockIndex(shader[2], "uConfig"); - glUniformBlockBinding(shader[2], uni_id, 16); - - glUseProgram(shader[2]); - uni_id = glGetUniformLocation(shader[2], "OSDTex"); - glUniform1i(uni_id, 0); - - return true; -} - -bool GLScreen_Init() -{ - GL_VSyncStatus = Config::ScreenVSync; - - // TODO: consider using epoxy? 
- if (!OpenGL_Init()) - return false; - - const GLubyte* renderer = glGetString(GL_RENDERER); // get renderer string - const GLubyte* version = glGetString(GL_VERSION); // version as a string - printf("OpenGL: renderer: %s\n", renderer); - printf("OpenGL: version: %s\n", version); - - if (!GLScreen_InitShader(GL_ScreenShader, kScreenFS)) - return false; - if (!GLScreen_InitShader(GL_ScreenShaderAccel, kScreenFS_Accel)) - return false; - if (!GLScreen_InitOSDShader(GL_ScreenShaderOSD)) - return false; - - memset(&GL_ShaderConfig, 0, sizeof(GL_ShaderConfig)); - - glGenBuffers(1, &GL_ShaderConfigUBO); - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - glBufferData(GL_UNIFORM_BUFFER, sizeof(GL_ShaderConfig), &GL_ShaderConfig, GL_STATIC_DRAW); - glBindBufferBase(GL_UNIFORM_BUFFER, 16, GL_ShaderConfigUBO); - - glGenBuffers(1, &GL_ScreenVertexBufferID); - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferData(GL_ARRAY_BUFFER, sizeof(GL_ScreenVertices), NULL, GL_STATIC_DRAW); - - glGenVertexArrays(1, &GL_ScreenVertexArrayID); - glBindVertexArray(GL_ScreenVertexArrayID); - glEnableVertexAttribArray(0); // position - glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(0)); - glEnableVertexAttribArray(1); // texcoord - glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 4*4, (void*)(2*4)); - - glGenTextures(1, &GL_ScreenTexture); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI, 256*3 + 1, 192*2, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, NULL); - - GL_ScreenSizeDirty = true; - - return true; -} - -void GLScreen_DeInit() -{ - glDeleteTextures(1, &GL_ScreenTexture); - - glDeleteVertexArrays(1, 
&GL_ScreenVertexArrayID); - glDeleteBuffers(1, &GL_ScreenVertexBufferID); - - OpenGL_DeleteShaderProgram(GL_ScreenShader); - OpenGL_DeleteShaderProgram(GL_ScreenShaderAccel); - OpenGL_DeleteShaderProgram(GL_ScreenShaderOSD); -} - -void GLScreen_DrawScreen() -{ - bool vsync = Config::ScreenVSync && !HotkeyDown(HK_FastForward); - if (vsync != GL_VSyncStatus) - { - GL_VSyncStatus = vsync; - uiGLSetVSync(vsync); - } - - float scale = uiGLGetFramebufferScale(GLContext); - - glBindFramebuffer(GL_FRAMEBUFFER, uiGLGetFramebuffer(GLContext)); - - if (GL_ScreenSizeDirty) - { - GL_ScreenSizeDirty = false; - - GL_ShaderConfig.uScreenSize[0] = WindowWidth; - GL_ShaderConfig.uScreenSize[1] = WindowHeight; - GL_ShaderConfig.u3DScale = GL_3DScale; - - glBindBuffer(GL_UNIFORM_BUFFER, GL_ShaderConfigUBO); - void* unibuf = glMapBuffer(GL_UNIFORM_BUFFER, GL_WRITE_ONLY); - if (unibuf) memcpy(unibuf, &GL_ShaderConfig, sizeof(GL_ShaderConfig)); - glUnmapBuffer(GL_UNIFORM_BUFFER); - - float scwidth, scheight; - - float x0, y0, x1, y1; - float s0, s1, s2, s3; - float t0, t1, t2, t3; - -#define SETVERTEX(i, x, y, s, t) \ - GL_ScreenVertices[4*(i) + 0] = x; \ - GL_ScreenVertices[4*(i) + 1] = y; \ - GL_ScreenVertices[4*(i) + 2] = s; \ - GL_ScreenVertices[4*(i) + 3] = t; - - x0 = TopScreenRect.X; - y0 = TopScreenRect.Y; - x1 = TopScreenRect.X + TopScreenRect.Width; - y1 = TopScreenRect.Y + TopScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 0; - s1 = scwidth; t1 = 0; - s2 = 0; t2 = scheight; - s3 = scwidth; t3 = scheight; - break; - - case 1: - s0 = 0; t0 = scheight; - s1 = 0; t1 = 0; - s2 = scwidth; t2 = scheight; - s3 = scwidth; t3 = 0; - break; - - case 2: - s0 = scwidth; t0 = scheight; - s1 = 0; t1 = scheight; - s2 = scwidth; t2 = 0; - s3 = 0; t3 = 0; - break; - - case 3: - s0 = scwidth; t0 = 0; - s1 = scwidth; t1 = scheight; - s2 = 0; t2 = 0; - s3 = 0; t3 = scheight; - break; - } - - SETVERTEX(0, x0, y0, s0, t0); - 
SETVERTEX(1, x1, y1, s3, t3); - SETVERTEX(2, x1, y0, s1, t1); - SETVERTEX(3, x0, y0, s0, t0); - SETVERTEX(4, x0, y1, s2, t2); - SETVERTEX(5, x1, y1, s3, t3); - - x0 = BottomScreenRect.X; - y0 = BottomScreenRect.Y; - x1 = BottomScreenRect.X + BottomScreenRect.Width; - y1 = BottomScreenRect.Y + BottomScreenRect.Height; - - scwidth = 256; - scheight = 192; - - switch (ScreenRotation) - { - case 0: - s0 = 0; t0 = 192; - s1 = scwidth; t1 = 192; - s2 = 0; t2 = 192+scheight; - s3 = scwidth; t3 = 192+scheight; - break; - - case 1: - s0 = 0; t0 = 192+scheight; - s1 = 0; t1 = 192; - s2 = scwidth; t2 = 192+scheight; - s3 = scwidth; t3 = 192; - break; - - case 2: - s0 = scwidth; t0 = 192+scheight; - s1 = 0; t1 = 192+scheight; - s2 = scwidth; t2 = 192; - s3 = 0; t3 = 192; - break; - - case 3: - s0 = scwidth; t0 = 192; - s1 = scwidth; t1 = 192+scheight; - s2 = 0; t2 = 192; - s3 = 0; t3 = 192+scheight; - break; - } - - SETVERTEX(6, x0, y0, s0, t0); - SETVERTEX(7, x1, y1, s3, t3); - SETVERTEX(8, x1, y0, s1, t1); - SETVERTEX(9, x0, y0, s0, t0); - SETVERTEX(10, x0, y1, s2, t2); - SETVERTEX(11, x1, y1, s3, t3); - -#undef SETVERTEX - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(GL_ScreenVertices), GL_ScreenVertices); - } - - glDisable(GL_DEPTH_TEST); - glDisable(GL_STENCIL_TEST); - glDisable(GL_BLEND); - glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - - glViewport(0, 0, WindowWidth*scale, WindowHeight*scale); - - if (GPU3D::Renderer == 0) - OpenGL_UseShaderProgram(GL_ScreenShader); - else - OpenGL_UseShaderProgram(GL_ScreenShaderAccel); - - glClearColor(0, 0, 0, 1); - glClear(GL_COLOR_BUFFER_BIT); - - if (RunningSomething) - { - int frontbuf = GPU::FrontBuffer; - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, GL_ScreenTexture); - - if (GPU::Framebuffer[frontbuf][0] && GPU::Framebuffer[frontbuf][1]) - { - if (GPU3D::Renderer == 0) - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256, 192, GL_RGBA_INTEGER, - 
GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - else - { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]); - } - } - - glActiveTexture(GL_TEXTURE1); - if (GPU3D::Renderer != 0) - GPU3D::GLRenderer::SetupAccelFrame(); - - glBindBuffer(GL_ARRAY_BUFFER, GL_ScreenVertexBufferID); - glBindVertexArray(GL_ScreenVertexArrayID); - glDrawArrays(GL_TRIANGLES, 0, 4*3); - } - - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Update(true, NULL); - - glFlush(); - uiGLSwapBuffers(GLContext); -} - -void MicLoadWav(char* name) -{ - SDL_AudioSpec format; - memset(&format, 0, sizeof(SDL_AudioSpec)); - - if (MicWavBuffer) delete[] MicWavBuffer; - MicWavBuffer = NULL; - MicWavLength = 0; - - u8* buf; - u32 len; - if (!SDL_LoadWAV(name, &format, &buf, &len)) - return; - - const u64 dstfreq = 44100; - - if (format.format == AUDIO_S16 || format.format == AUDIO_U16) - { - int srcinc = format.channels; - len /= (2 * srcinc); - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = ((u16*)buf)[res_pos]; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else if (format.format == AUDIO_S8 || format.format == AUDIO_U8) - { - int srcinc = format.channels; - len /= srcinc; - - MicWavLength = (len * dstfreq) / format.freq; - if (MicWavLength < 735) MicWavLength = 735; - MicWavBuffer = new 
s16[MicWavLength]; - - float res_incr = len / (float)MicWavLength; - float res_timer = 0; - int res_pos = 0; - - for (int i = 0; i < MicWavLength; i++) - { - u16 val = buf[res_pos] << 8; - if (SDL_AUDIO_ISUNSIGNED(format.format)) val ^= 0x8000; - - MicWavBuffer[i] = val; - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos += srcinc; - } - } - } - else - printf("bad WAV format %08X\n", format.format); - - SDL_FreeWAV(buf); -} - -void AudioCallback(void* data, Uint8* stream, int len) -{ - len /= (sizeof(s16) * 2); - - // resample incoming audio to match the output sample rate - - float f_len_in = (len * 32823.6328125) / (float)AudioFreq; - f_len_in += AudioSampleFrac; - int len_in = (int)floor(f_len_in); - AudioSampleFrac = f_len_in - len_in; - - s16 buf_in[1024*2]; - s16* buf_out = (s16*)stream; - - int num_in; - int num_out = len; - - SDL_LockMutex(AudioSyncLock); - num_in = SPU::ReadOutput(buf_in, len_in); - SDL_CondSignal(AudioSync); - SDL_UnlockMutex(AudioSyncLock); - - if (num_in < 1) - { - memset(stream, 0, len*sizeof(s16)*2); - return; - } - - int margin = 6; - if (num_in < len_in-margin) - { - int last = num_in-1; - if (last < 0) last = 0; - - for (int i = num_in; i < len_in-margin; i++) - ((u32*)buf_in)[i] = ((u32*)buf_in)[last]; - - num_in = len_in-margin; - } - - float res_incr = num_in / (float)num_out; - float res_timer = 0; - int res_pos = 0; - - int volume = Config::AudioVolume; - - for (int i = 0; i < len; i++) - { - buf_out[i*2 ] = (buf_in[res_pos*2 ] * volume) >> 8; - buf_out[i*2+1] = (buf_in[res_pos*2+1] * volume) >> 8; - - /*s16 s_l = buf_in[res_pos*2 ]; - s16 s_r = buf_in[res_pos*2+1]; - - float a = res_timer; - float b = 1.0 - a; - s_l = (s_l * a) + (buf_in[(res_pos-1)*2 ] * b); - s_r = (s_r * a) + (buf_in[(res_pos-1)*2+1] * b); - - buf_out[i*2 ] = (s_l * volume) >> 8; - buf_out[i*2+1] = (s_r * volume) >> 8;*/ - - res_timer += res_incr; - while (res_timer >= 1.0) - { - res_timer -= 1.0; - res_pos++; - } - } 
-} - -void MicCallback(void* data, Uint8* stream, int len) -{ - if (Config::MicInputType != 1) return; - - s16* input = (s16*)stream; - len /= sizeof(s16); - - if ((MicBufferWritePos + len) > MicBufferLength) - { - u32 len1 = MicBufferLength - MicBufferWritePos; - memcpy(&MicBuffer[MicBufferWritePos], &input[0], len1*sizeof(s16)); - memcpy(&MicBuffer[0], &input[len1], (len - len1)*sizeof(s16)); - MicBufferWritePos = len - len1; - } - else - { - memcpy(&MicBuffer[MicBufferWritePos], input, len*sizeof(s16)); - MicBufferWritePos += len; - } -} - -void FeedMicInput() -{ - int type = Config::MicInputType; - bool cmd = HotkeyDown(HK_Mic); - - if ((type != 1 && !cmd) || - (type == 1 && MicBufferLength == 0) || - (type == 3 && MicWavBuffer == NULL)) - { - type = 0; - MicBufferReadPos = 0; - } - - switch (type) - { - case 0: // no mic - NDS::MicInputFrame(NULL, 0); - break; - - case 1: // host mic - if ((MicBufferReadPos + 735) > MicBufferLength) - { - s16 tmp[735]; - u32 len1 = MicBufferLength - MicBufferReadPos; - memcpy(&tmp[0], &MicBuffer[MicBufferReadPos], len1*sizeof(s16)); - memcpy(&tmp[len1], &MicBuffer[0], (735 - len1)*sizeof(s16)); - - NDS::MicInputFrame(tmp, 735); - MicBufferReadPos = 735 - len1; - } - else - { - NDS::MicInputFrame(&MicBuffer[MicBufferReadPos], 735); - MicBufferReadPos += 735; - } - break; - - case 2: // white noise - { - s16 tmp[735]; - for (int i = 0; i < 735; i++) tmp[i] = rand() & 0xFFFF; - NDS::MicInputFrame(tmp, 735); - } - break; - - case 3: // WAV - if ((MicBufferReadPos + 735) > MicWavLength) - { - s16 tmp[735]; - u32 len1 = MicWavLength - MicBufferReadPos; - memcpy(&tmp[0], &MicWavBuffer[MicBufferReadPos], len1*sizeof(s16)); - memcpy(&tmp[len1], &MicWavBuffer[0], (735 - len1)*sizeof(s16)); - - NDS::MicInputFrame(tmp, 735); - MicBufferReadPos = 735 - len1; - } - else - { - NDS::MicInputFrame(&MicWavBuffer[MicBufferReadPos], 735); - MicBufferReadPos += 735; - } - break; - } -} - -void OpenJoystick() -{ - if (Joystick) 
SDL_JoystickClose(Joystick); - - int num = SDL_NumJoysticks(); - if (num < 1) - { - Joystick = NULL; - return; - } - - if (JoystickID >= num) - JoystickID = 0; - - Joystick = SDL_JoystickOpen(JoystickID); -} - -bool JoystickButtonDown(int val) -{ - if (val == -1) return false; - - bool hasbtn = ((val & 0xFFFF) != 0xFFFF); - - if (hasbtn) - { - if (val & 0x100) - { - int hatnum = (val >> 4) & 0xF; - int hatdir = val & 0xF; - Uint8 hatval = SDL_JoystickGetHat(Joystick, hatnum); - - bool pressed = false; - if (hatdir == 0x1) pressed = (hatval & SDL_HAT_UP); - else if (hatdir == 0x4) pressed = (hatval & SDL_HAT_DOWN); - else if (hatdir == 0x2) pressed = (hatval & SDL_HAT_RIGHT); - else if (hatdir == 0x8) pressed = (hatval & SDL_HAT_LEFT); - - if (pressed) return true; - } - else - { - int btnnum = val & 0xFFFF; - Uint8 btnval = SDL_JoystickGetButton(Joystick, btnnum); - - if (btnval) return true; - } - } - - if (val & 0x10000) - { - int axisnum = (val >> 24) & 0xF; - int axisdir = (val >> 20) & 0xF; - Sint16 axisval = SDL_JoystickGetAxis(Joystick, axisnum); - - switch (axisdir) - { - case 0: // positive - if (axisval > 16384) return true; - break; - - case 1: // negative - if (axisval < -16384) return true; - break; - - case 2: // trigger - if (axisval > 0) return true; - break; - } - } - - return false; -} - -void ProcessInput() -{ - SDL_JoystickUpdate(); - - if (Joystick) - { - if (!SDL_JoystickGetAttached(Joystick)) - { - SDL_JoystickClose(Joystick); - Joystick = NULL; - } - } - if (!Joystick && (SDL_NumJoysticks() > 0)) - { - JoystickID = Config::JoystickID; - OpenJoystick(); - } - - JoyInputMask = 0xFFF; - for (int i = 0; i < 12; i++) - if (JoystickButtonDown(Config::JoyMapping[i])) - JoyInputMask &= ~(1<> 4); - - bool pressed = false; - if (btnid == 0x101) // up - pressed = (hat & SDL_HAT_UP); - else if (btnid == 0x104) // down - pressed = (hat & SDL_HAT_DOWN); - else if (btnid == 0x102) // right - pressed = (hat & SDL_HAT_RIGHT); - else if (btnid == 0x108) // 
left - pressed = (hat & SDL_HAT_LEFT); - else if (btnid < njoybuttons) - pressed = (joybuttons[btnid] & ~(joybuttons[btnid] >> 1)) & 0x01; - - return pressed; -} - -bool JoyButtonHeld(int btnid, int njoybuttons, Uint8* joybuttons, Uint32 hat) -{ - if (btnid < 0) return false; - - bool pressed = false; - if (btnid == 0x101) // up - pressed = (hat & SDL_HAT_UP); - else if (btnid == 0x104) // down - pressed = (hat & SDL_HAT_DOWN); - else if (btnid == 0x102) // right - pressed = (hat & SDL_HAT_RIGHT); - else if (btnid == 0x108) // left - pressed = (hat & SDL_HAT_LEFT); - else if (btnid < njoybuttons) - pressed = joybuttons[btnid] & 0x01; - - return pressed; -} - -void UpdateWindowTitle(void* data) -{ - if (EmuStatus == 0) return; - void** dataarray = (void**)data; - SDL_LockMutex((SDL_mutex*)dataarray[1]); - uiWindowSetTitle(MainWindow, (const char*)dataarray[0]); - SDL_UnlockMutex((SDL_mutex*)dataarray[1]); -} - -void UpdateFPSLimit(void* data) -{ - uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1); -} - -int EmuThreadFunc(void* burp) -{ - NDS::Init(); - - MainScreenPos[0] = 0; - MainScreenPos[1] = 0; - MainScreenPos[2] = 0; - AutoScreenSizing = 0; - - if (Screen_UseGL) - { - uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(true); - uiGLMakeContextCurrent(NULL); - } - else - { - GPU3D::InitRenderer(false); - } - - Touching = false; - KeyInputMask = 0xFFF; - JoyInputMask = 0xFFF; - KeyHotkeyMask = 0; - JoyHotkeyMask = 0; - HotkeyMask = 0; - LastHotkeyMask = 0; - LidStatus = false; - - u32 nframes = 0; - u32 starttick = SDL_GetTicks(); - u32 lasttick = starttick; - u32 lastmeasuretick = lasttick; - u32 fpslimitcount = 0; - u64 perfcount = SDL_GetPerformanceCounter(); - u64 perffreq = SDL_GetPerformanceFrequency(); - float samplesleft = 0; - u32 nsamples = 0; - - char melontitle[100]; - SDL_mutex* titlemutex = SDL_CreateMutex(); - void* titledata[2] = {melontitle, titlemutex}; - - while (EmuRunning != 0) - { - ProcessInput(); - - if 
(HotkeyPressed(HK_FastForwardToggle)) - { - Config::LimitFPS = !Config::LimitFPS; - uiQueueMain(UpdateFPSLimit, NULL); - } - // TODO: similar hotkeys for video/audio sync? - - if (HotkeyPressed(HK_Pause)) uiQueueMain(TogglePause, NULL); - if (HotkeyPressed(HK_Reset)) uiQueueMain(Reset, NULL); - - if (GBACart::CartInserted && GBACart::HasSolarSensor) - { - if (HotkeyPressed(HK_SolarSensorDecrease)) - { - if (GBACart_SolarSensor::LightLevel > 0) GBACart_SolarSensor::LightLevel--; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - if (HotkeyPressed(HK_SolarSensorIncrease)) - { - if (GBACart_SolarSensor::LightLevel < 10) GBACart_SolarSensor::LightLevel++; - char msg[64]; - sprintf(msg, "Solar sensor level set to %d", GBACart_SolarSensor::LightLevel); - OSD::AddMessage(0, msg); - } - } - - if (EmuRunning == 1) - { - EmuStatus = 1; - - // process input and hotkeys - NDS::SetKeyMask(KeyInputMask & JoyInputMask); - - if (HotkeyPressed(HK_Lid)) - { - LidStatus = !LidStatus; - NDS::SetLidClosed(LidStatus); - OSD::AddMessage(0, LidStatus ? "Lid closed" : "Lid opened"); - } - - // microphone input - FeedMicInput(); - - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - } - - // auto screen layout - { - MainScreenPos[2] = MainScreenPos[1]; - MainScreenPos[1] = MainScreenPos[0]; - MainScreenPos[0] = NDS::PowerControl9 >> 15; - - int guess; - if (MainScreenPos[0] == MainScreenPos[2] && - MainScreenPos[0] != MainScreenPos[1]) - { - // constant flickering, likely displaying 3D on both screens - // TODO: when both screens are used for 2D only...??? 
- guess = 0; - } - else - { - if (MainScreenPos[0] == 1) - guess = 1; - else - guess = 2; - } - - if (guess != AutoScreenSizing) - { - AutoScreenSizing = guess; - SetupScreenRects(WindowWidth, WindowHeight); - } - } - - // emulate - u32 nlines = NDS::RunFrame(); - -#ifdef MELONCAP - MelonCap::Update(); -#endif // MELONCAP - - if (EmuRunning == 0) break; - - if (Screen_UseGL) - { - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - - bool fastforward = HotkeyDown(HK_FastForward); - - if (Config::AudioSync && !fastforward) - { - SDL_LockMutex(AudioSyncLock); - while (SPU::GetOutputSize() > 1024) - { - int ret = SDL_CondWaitTimeout(AudioSync, AudioSyncLock, 500); - if (ret == SDL_MUTEX_TIMEDOUT) break; - } - SDL_UnlockMutex(AudioSyncLock); - } - else - { - // ensure the audio FIFO doesn't overflow - //SPU::TrimOutput(); - } - - float framerate = (1000.0f * nlines) / (60.0f * 263.0f); - - { - u32 curtick = SDL_GetTicks(); - u32 delay = curtick - lasttick; - - bool limitfps = Config::LimitFPS && !fastforward; - if (limitfps) - { - float wantedtickF = starttick + (framerate * (fpslimitcount+1)); - u32 wantedtick = (u32)ceil(wantedtickF); - if (curtick < wantedtick) SDL_Delay(wantedtick - curtick); - - lasttick = SDL_GetTicks(); - fpslimitcount++; - if ((abs(wantedtickF - (float)wantedtick) < 0.001312) || (fpslimitcount > 60)) - { - fpslimitcount = 0; - nsamples = 0; - starttick = lasttick; - } - } - else - { - if (delay < 1) SDL_Delay(1); - lasttick = SDL_GetTicks(); - } - } - - nframes++; - if (nframes >= 30) - { - u32 tick = SDL_GetTicks(); - u32 diff = tick - lastmeasuretick; - lastmeasuretick = tick; - - u32 fps; - if (diff < 1) fps = 77777; - else fps = (nframes * 1000) / diff; - nframes = 0; - - float fpstarget; - if (framerate < 1) fpstarget = 999; - else fpstarget = 1000.0f/framerate; - - SDL_LockMutex(titlemutex); - sprintf(melontitle, "[%d/%.0f] melonDS " MELONDS_VERSION, fps, fpstarget); - SDL_UnlockMutex(titlemutex); - 
uiQueueMain(UpdateWindowTitle, titledata); - } - } - else - { - // paused - nframes = 0; - lasttick = SDL_GetTicks(); - starttick = lasttick; - lastmeasuretick = lasttick; - fpslimitcount = 0; - - if (EmuRunning == 2) - { - if (Screen_UseGL) - { - uiGLBegin(GLContext); - uiGLMakeContextCurrent(GLContext); - GLScreen_DrawScreen(); - uiGLEnd(GLContext); - } - uiAreaQueueRedrawAll(MainDrawArea); - } - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - EmuStatus = EmuRunning; - - SDL_Delay(100); - } - } - - EmuStatus = 0; - - SDL_DestroyMutex(titlemutex); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - - NDS::DeInit(); - Platform::LAN_DeInit(); - - if (Screen_UseGL) - { - OSD::DeInit(true); - GLScreen_DeInit(); - } - else - OSD::DeInit(false); - - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - return 44203; -} - -void StopEmuThread() -{ - EmuRunning = 0; - SDL_WaitThread(EmuThread, NULL); -} - - -void OnAreaDraw(uiAreaHandler* handler, uiArea* area, uiAreaDrawParams* params) -{ - if (!ScreenDrawInited) - { - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenDrawInited = true; - ScreenBitmap[0] = uiDrawNewBitmap(params->Context, 256, 192, 0); - ScreenBitmap[1] = uiDrawNewBitmap(params->Context, 256, 192, 0); - } - - int frontbuf = GPU::FrontBuffer; - if (!ScreenBitmap[0] || !ScreenBitmap[1]) return; - if (!GPU::Framebuffer[frontbuf][0] || !GPU::Framebuffer[frontbuf][1]) return; - - uiRect top = {0, 0, 256, 192}; - uiRect bot = {0, 0, 256, 192}; - - uiDrawBitmapUpdate(ScreenBitmap[0], GPU::Framebuffer[frontbuf][0]); - uiDrawBitmapUpdate(ScreenBitmap[1], GPU::Framebuffer[frontbuf][1]); - - uiDrawSave(params->Context); - uiDrawTransform(params->Context, &TopScreenTrans); - uiDrawBitmapDraw(params->Context, ScreenBitmap[0], &top, &TopScreenRect, Config::ScreenFilter==1); - uiDrawRestore(params->Context); - - uiDrawSave(params->Context); - uiDrawTransform(params->Context, 
&BottomScreenTrans); - uiDrawBitmapDraw(params->Context, ScreenBitmap[1], &bot, &BottomScreenRect, Config::ScreenFilter==1); - uiDrawRestore(params->Context); - - OSD::Update(false, params); -} - -void OnAreaMouseEvent(uiAreaHandler* handler, uiArea* area, uiAreaMouseEvent* evt) -{ - int x = (int)evt->X; - int y = (int)evt->Y; - - if (Touching && (evt->Up == 1)) - { - Touching = false; - NDS::ReleaseKey(16+6); - NDS::ReleaseScreen(); - } - else if (!Touching && (evt->Down == 1) && - (x >= BottomScreenRect.X) && (y >= BottomScreenRect.Y) && - (x < (BottomScreenRect.X+BottomScreenRect.Width)) && (y < (BottomScreenRect.Y+BottomScreenRect.Height))) - { - Touching = true; - NDS::PressKey(16+6); - } - - if (Touching) - { - x -= BottomScreenRect.X; - y -= BottomScreenRect.Y; - - if (ScreenRotation == 0 || ScreenRotation == 2) - { - if (BottomScreenRect.Width != 256) - x = (x * 256) / BottomScreenRect.Width; - if (BottomScreenRect.Height != 192) - y = (y * 192) / BottomScreenRect.Height; - - if (ScreenRotation == 2) - { - x = 255 - x; - y = 191 - y; - } - } - else - { - if (BottomScreenRect.Width != 192) - x = (x * 192) / BottomScreenRect.Width; - if (BottomScreenRect.Height != 256) - y = (y * 256) / BottomScreenRect.Height; - - if (ScreenRotation == 1) - { - int tmp = x; - x = y; - y = 191 - tmp; - } - else - { - int tmp = x; - x = 255 - y; - y = tmp; - } - } - - // clamp - if (x < 0) x = 0; - else if (x > 255) x = 255; - if (y < 0) y = 0; - else if (y > 191) y = 191; - - // TODO: take advantage of possible extra precision when possible? 
(scaled window for example) - NDS::TouchScreen(x, y); - } -} - -void OnAreaMouseCrossed(uiAreaHandler* handler, uiArea* area, int left) -{ -} - -void OnAreaDragBroken(uiAreaHandler* handler, uiArea* area) -{ -} - -bool EventMatchesKey(uiAreaKeyEvent* evt, int val, bool checkmod) -{ - if (val == -1) return false; - - int key = val & 0xFFFF; - int mod = val >> 16; - return evt->Scancode == key && (!checkmod || evt->Modifiers == mod); -} - -int OnAreaKeyEvent(uiAreaHandler* handler, uiArea* area, uiAreaKeyEvent* evt) -{ - // TODO: release all keys if the window loses focus? or somehow global key input? - if (evt->Scancode == 0x38) // ALT - return 0; - if (evt->Modifiers == 0x2) // ALT+key - return 0; - - if (evt->Up) - { - for (int i = 0; i < 12; i++) - if (EventMatchesKey(evt, Config::KeyMapping[i], false)) - KeyInputMask |= (1<Repeat) - { - // TODO, eventually: make savestate keys configurable? - // F keys: 3B-44, 57-58 | SHIFT: mod. 0x4 - if (evt->Scancode >= 0x3B && evt->Scancode <= 0x42) // F1-F8, quick savestate - { - if (evt->Modifiers == 0x4) SaveState(1 + (evt->Scancode - 0x3B)); - else if (evt->Modifiers == 0x0) LoadState(1 + (evt->Scancode - 0x3B)); - } - else if (evt->Scancode == 0x43) // F9, savestate from/to file - { - if (evt->Modifiers == 0x4) SaveState(0); - else if (evt->Modifiers == 0x0) LoadState(0); - } - else if (evt->Scancode == 0x58) // F12, undo savestate - { - if (evt->Modifiers == 0x0) UndoStateLoad(); - } - - for (int i = 0; i < 12; i++) - if (EventMatchesKey(evt, Config::KeyMapping[i], false)) - KeyInputMask &= ~(1<Scancode == 0x57) // F11 - // NDS::debug(0); - } - - return 1; -} - -void SetupScreenRects(int width, int height) -{ - bool horizontal = false; - bool sideways = false; - - if (ScreenRotation == 1 || ScreenRotation == 3) - sideways = true; - - if (ScreenLayout == 2) horizontal = true; - else if (ScreenLayout == 0) - { - if (sideways) - horizontal = true; - } - - int sizemode; - if (ScreenSizing == 3) - sizemode = 
AutoScreenSizing; - else - sizemode = ScreenSizing; - - int screenW, screenH, gap; - if (sideways) - { - screenW = 192; - screenH = 256; - } - else - { - screenW = 256; - screenH = 192; - } - - gap = ScreenGap; - - uiRect *topscreen, *bottomscreen; - if (ScreenRotation == 1 || ScreenRotation == 2) - { - topscreen = &BottomScreenRect; - bottomscreen = &TopScreenRect; - } - else - { - topscreen = &TopScreenRect; - bottomscreen = &BottomScreenRect; - } - - if (horizontal) - { - // side-by-side - - int heightreq; - int startX = 0; - - width -= gap; - - if (sizemode == 0) // even - { - heightreq = (width * screenH) / (screenW*2); - if (heightreq > height) - { - int newwidth = (height * width) / heightreq; - startX = (width - newwidth) / 2; - heightreq = height; - width = newwidth; - } - } - else // emph. top/bottom - { - heightreq = ((width - screenW) * screenH) / screenW; - if (heightreq > height) - { - int newwidth = ((height * (width - screenW)) / heightreq) + screenW; - startX = (width - newwidth) / 2; - heightreq = height; - width = newwidth; - } - } - - if (sizemode == 2) - { - topscreen->Width = screenW; - topscreen->Height = screenH; - } - else - { - topscreen->Width = (sizemode==0) ? 
(width / 2) : (width - screenW); - topscreen->Height = heightreq; - } - topscreen->X = startX; - topscreen->Y = ((height - heightreq) / 2) + (heightreq - topscreen->Height); - - bottomscreen->X = topscreen->X + topscreen->Width + gap; - - if (sizemode == 1) - { - bottomscreen->Width = screenW; - bottomscreen->Height = screenH; - } - else - { - bottomscreen->Width = width - topscreen->Width; - bottomscreen->Height = heightreq; - } - bottomscreen->Y = ((height - heightreq) / 2) + (heightreq - bottomscreen->Height); - } - else - { - // top then bottom - - int widthreq; - int startY = 0; - - height -= gap; - - if (sizemode == 0) // even - { - widthreq = (height * screenW) / (screenH*2); - if (widthreq > width) - { - int newheight = (width * height) / widthreq; - startY = (height - newheight) / 2; - widthreq = width; - height = newheight; - } - } - else // emph. top/bottom - { - widthreq = ((height - screenH) * screenW) / screenH; - if (widthreq > width) - { - int newheight = ((width * (height - screenH)) / widthreq) + screenH; - startY = (height - newheight) / 2; - widthreq = width; - height = newheight; - } - } - - if (sizemode == 2) - { - topscreen->Width = screenW; - topscreen->Height = screenH; - } - else - { - topscreen->Width = widthreq; - topscreen->Height = (sizemode==0) ? 
(height / 2) : (height - screenH); - } - topscreen->Y = startY; - topscreen->X = (width - topscreen->Width) / 2; - - bottomscreen->Y = topscreen->Y + topscreen->Height + gap; - - if (sizemode == 1) - { - bottomscreen->Width = screenW; - bottomscreen->Height = screenH; - } - else - { - bottomscreen->Width = widthreq; - bottomscreen->Height = height - topscreen->Height; - } - bottomscreen->X = (width - bottomscreen->Width) / 2; - } - - // setup matrices for potential rotation - - uiDrawMatrixSetIdentity(&TopScreenTrans); - uiDrawMatrixSetIdentity(&BottomScreenTrans); - - switch (ScreenRotation) - { - case 1: // 90° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y); - } - break; - - case 2: // 180° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X+TopScreenRect.Width, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, M_PI); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X+BottomScreenRect.Width, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - - 
case 3: // 270° - { - uiDrawMatrixTranslate(&TopScreenTrans, -TopScreenRect.X, -TopScreenRect.Y); - uiDrawMatrixRotate(&TopScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&TopScreenTrans, 0, 0, - TopScreenRect.Width/(double)TopScreenRect.Height, - TopScreenRect.Height/(double)TopScreenRect.Width); - uiDrawMatrixTranslate(&TopScreenTrans, TopScreenRect.X, TopScreenRect.Y+TopScreenRect.Height); - - uiDrawMatrixTranslate(&BottomScreenTrans, -BottomScreenRect.X, -BottomScreenRect.Y); - uiDrawMatrixRotate(&BottomScreenTrans, 0, 0, -M_PI/2.0f); - uiDrawMatrixScale(&BottomScreenTrans, 0, 0, - BottomScreenRect.Width/(double)BottomScreenRect.Height, - BottomScreenRect.Height/(double)BottomScreenRect.Width); - uiDrawMatrixTranslate(&BottomScreenTrans, BottomScreenRect.X, BottomScreenRect.Y+BottomScreenRect.Height); - } - break; - } - - GL_ScreenSizeDirty = true; -} - -void SetMinSize(int w, int h) -{ - int cw, ch; - uiWindowContentSize(MainWindow, &cw, &ch); - - uiControlSetMinSize(uiControl(MainDrawArea), w, h); - if ((cw < w) || (ch < h)) - { - if (cw < w) cw = w; - if (ch < h) ch = h; - uiWindowSetContentSize(MainWindow, cw, ch); - } -} - -void OnAreaResize(uiAreaHandler* handler, uiArea* area, int width, int height) -{ - SetupScreenRects(width, height); - - // TODO: - // should those be the size of the uiArea, or the size of the window client area? - // for now the uiArea fills the whole window anyway - // but... 
we never know, I guess - WindowWidth = width; - WindowHeight = height; - - int ismax = uiWindowMaximized(MainWindow); - int ismin = uiWindowMinimized(MainWindow); - - Config::WindowMaximized = ismax; - if (!ismax && !ismin) - { - Config::WindowWidth = width; - Config::WindowHeight = height; - } - - OSD::WindowResized(Screen_UseGL); -} - - -void Run() -{ - EmuRunning = 1; - RunningSomething = true; - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - uiMenuItemEnable(MenuItem_SaveState); - uiMenuItemEnable(MenuItem_LoadState); - - if (SavestateLoaded) - uiMenuItemEnable(MenuItem_UndoStateLoad); - else - uiMenuItemDisable(MenuItem_UndoStateLoad); - - for (int i = 0; i < 8; i++) - { - char ssfile[1024]; - GetSavestateName(i+1, ssfile, 1024); - if (Platform::FileExists(ssfile)) uiMenuItemEnable(MenuItem_LoadStateSlot[i]); - else uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - } - - for (int i = 0; i < 9; i++) uiMenuItemEnable(MenuItem_SaveStateSlot[i]); - uiMenuItemEnable(MenuItem_LoadStateSlot[8]); - - uiMenuItemEnable(MenuItem_Pause); - uiMenuItemEnable(MenuItem_Reset); - uiMenuItemEnable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); -} - -void TogglePause(void* blarg) -{ - if (!RunningSomething) return; - - if (EmuRunning == 1) - { - // enable pause - EmuRunning = 2; - uiMenuItemSetChecked(MenuItem_Pause, 1); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0, "Paused"); - } - else - { - // disable pause - EmuRunning = 1; - uiMenuItemSetChecked(MenuItem_Pause, 0); - - SPU::InitOutput(); - AudioSampleFrac = 0; - SDL_PauseAudioDevice(AudioDevice, 0); - SDL_PauseAudioDevice(MicDevice, 0); - - OSD::AddMessage(0, "Resumed"); - } -} - -void Reset(void* blarg) -{ - if (!RunningSomething) return; - - EmuRunning = 2; - while (EmuStatus != 2); - - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - 
- if (ROMPath[0][0] == '\0') - NDS::LoadBIOS(); - else - { - SetupSRAMPath(0); - NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot); - } - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - - Run(); - - OSD::AddMessage(0, "Reset"); -} - -void Stop(bool internal) -{ - EmuRunning = 2; - if (!internal) // if shutting down from the UI thread, wait till the emu thread has stopped - while (EmuStatus != 2); - RunningSomething = false; - - // eject any inserted GBA cartridge - GBACart::Eject(); - ROMPath[1][0] = '\0'; - - uiWindowSetTitle(MainWindow, "melonDS " MELONDS_VERSION); - - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]); - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - uiMenuItemDisable(MenuItem_UndoStateLoad); - - uiMenuItemDisable(MenuItem_Pause); - uiMenuItemDisable(MenuItem_Reset); - uiMenuItemDisable(MenuItem_Stop); - uiMenuItemSetChecked(MenuItem_Pause, 0); - - uiAreaQueueRedrawAll(MainDrawArea); - - SPU::DrainOutput(); - SDL_PauseAudioDevice(AudioDevice, 1); - SDL_PauseAudioDevice(MicDevice, 1); - - OSD::AddMessage(0xFFC040, "Shutdown"); -} - -void SetupSRAMPath(int slot) -{ - strncpy(SRAMPath[slot], ROMPath[slot], 1023); - SRAMPath[slot][1023] = '\0'; - strncpy(SRAMPath[slot] + strlen(ROMPath[slot]) - 3, "sav", 3); -} - -void TryLoadROM(char* file, int slot, int prevstatus) -{ - char oldpath[1024]; - char oldsram[1024]; - strncpy(oldpath, ROMPath[slot], 1024); - strncpy(oldsram, SRAMPath[slot], 1024); - - strncpy(ROMPath[slot], file, 1023); - ROMPath[slot][1023] = '\0'; - - SetupSRAMPath(0); - SetupSRAMPath(1); - - if (slot == 0 && NDS::LoadROM(ROMPath[slot], SRAMPath[slot], Config::DirectBoot)) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - // Reload the inserted GBA cartridge (if any) - if (ROMPath[1][0] != '\0') NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); 
// safety - Run(); - } - else if (slot == 1 && NDS::LoadGBAROM(ROMPath[slot], SRAMPath[slot])) - { - SavestateLoaded = false; - uiMenuItemDisable(MenuItem_UndoStateLoad); - - strncpy(PrevSRAMPath[slot], SRAMPath[slot], 1024); // safety - if (RunningSomething) Run(); // do not start just from a GBA cart - } - else - { - uiMsgBoxError(MainWindow, - "Failed to load the ROM", - "Make sure the file can be accessed and isn't opened in another application."); - - strncpy(ROMPath[slot], oldpath, 1024); - strncpy(SRAMPath[slot], oldsram, 1024); - EmuRunning = prevstatus; - } -} - - -// SAVESTATE TODO -// * configurable paths. not everyone wants their ROM directory to be polluted, I guess. - -void GetSavestateName(int slot, char* filename, int len) -{ - int pos; - - if (ROMPath[0][0] == '\0') // running firmware, no ROM - { - strcpy(filename, "firmware"); - pos = 8; - } - else - { - int l = strlen(ROMPath[0]); - pos = l; - while (ROMPath[0][pos] != '.' && pos > 0) pos--; - if (pos == 0) pos = l; - - // avoid buffer overflow. 
shoddy - if (pos > len-5) pos = len-5; - - strncpy(&filename[0], ROMPath[0], pos); - } - strcpy(&filename[pos], ".ml"); - filename[pos+3] = '0'+slot; - filename[pos+4] = '\0'; -} - -void LoadState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiOpenFile(MainWindow, "melonDS savestate (any)|*.ml1;*.ml2;*.ml3;*.ml4;*.ml5;*.ml6;*.ml7;*.ml8;*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - if (!Platform::FileExists(filename)) - { - char msg[64]; - if (slot > 0) sprintf(msg, "State slot %d is empty", slot); - else sprintf(msg, "State file does not exist"); - OSD::AddMessage(0xFFA0A0, msg); - - EmuRunning = prevstatus; - return; - } - - u32 oldGBACartCRC = GBACart::CartCRC; - - // backup - Savestate* backup = new Savestate("timewarp.mln", true); - NDS::DoSavestate(backup); - delete backup; - - bool failed = false; - - Savestate* state = new Savestate(filename, false); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not load savestate file."); - - // current state might be crapoed, so restore from sane backup - state = new Savestate("timewarp.mln", false); - failed = true; - } - - NDS::DoSavestate(state); - delete state; - - if (!failed) - { - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(PrevSRAMPath[0], SRAMPath[0], 1024); - - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], false); - } - - bool loadedPartialGBAROM = false; - - // in case we have a GBA cart inserted, and the GBA ROM changes - // due to having loaded a save state, we do not want to reload - // the previous cartridge on reset, or commit 
writes to any - // loaded save file. therefore, their paths are "nulled". - if (GBACart::CartInserted && GBACart::CartCRC != oldGBACartCRC) - { - ROMPath[1][0] = '\0'; - SRAMPath[1][0] = '\0'; - loadedPartialGBAROM = true; - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State loaded from slot %d%s", - slot, loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - else sprintf(msg, "State loaded from file%s", - loadedPartialGBAROM ? " (GBA ROM header only)" : ""); - OSD::AddMessage(0, msg); - - SavestateLoaded = true; - uiMenuItemEnable(MenuItem_UndoStateLoad); - } - - EmuRunning = prevstatus; -} - -void SaveState(int slot) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char filename[1024]; - - if (slot > 0) - { - GetSavestateName(slot, filename, 1024); - } - else - { - char* file = uiSaveFile(MainWindow, "melonDS savestate (*.mln)|*.mln", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - strncpy(filename, file, 1023); - filename[1023] = '\0'; - uiFreeText(file); - } - - Savestate* state = new Savestate(filename, true); - if (state->Error) - { - delete state; - - uiMsgBoxError(MainWindow, "Error", "Could not save state."); - } - else - { - NDS::DoSavestate(state); - delete state; - - if (slot > 0) - uiMenuItemEnable(MenuItem_LoadStateSlot[slot-1]); - - if (Config::SavestateRelocSRAM && ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], filename, 1019); - int len = strlen(SRAMPath[0]); - strcpy(&SRAMPath[0][len], ".sav"); - SRAMPath[0][len+4] = '\0'; - - NDS::RelocateSave(SRAMPath[0], true); - } - } - - char msg[64]; - if (slot > 0) sprintf(msg, "State saved to slot %d", slot); - else sprintf(msg, "State saved to file"); - OSD::AddMessage(0, msg); - - EmuRunning = prevstatus; -} - -void UndoStateLoad() -{ - if (!SavestateLoaded) return; - - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - // pray that this works - // what do we do if it doesn't??? 
- // but it should work. - Savestate* backup = new Savestate("timewarp.mln", false); - NDS::DoSavestate(backup); - delete backup; - - if (ROMPath[0][0]!='\0') - { - strncpy(SRAMPath[0], PrevSRAMPath[0], 1024); - NDS::RelocateSave(SRAMPath[0], false); - } - - OSD::AddMessage(0, "State load undone"); - - EmuRunning = prevstatus; -} - - -void CloseAllDialogs() -{ - DlgAudioSettings::Close(); - DlgEmuSettings::Close(); - DlgInputConfig::Close(0); - DlgInputConfig::Close(1); - DlgVideoSettings::Close(); - DlgWifiSettings::Close(); -} - - -int OnCloseWindow(uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - uiQuit(); - return 1; -} - -void OnDropFile(uiWindow* window, char* file, void* blarg) -{ - char* ext = &file[strlen(file)-3]; - int prevstatus = EmuRunning; - - if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl")) - { - if (RunningSomething) - { - EmuRunning = 2; - while (EmuStatus != 2); - } - - TryLoadROM(file, 0, prevstatus); - } - else if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } -} - -void OnGetFocus(uiWindow* window, void* blarg) -{ - uiControlSetFocus(uiControl(MainDrawArea)); -} - -void OnLoseFocus(uiWindow* window, void* blarg) -{ - // TODO: shit here? 
-} - -void OnCloseByMenu(uiMenuItem* item, uiWindow* window, void* blarg) -{ - EmuRunning = 3; - while (EmuStatus != 3); - - CloseAllDialogs(); - StopEmuThread(); - DestroyMainWindow(); - uiQuit(); -} - -void OnOpenFile(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int prevstatus = EmuRunning; - EmuRunning = 2; - while (EmuStatus != 2); - - char* file = uiOpenFile(window, "DS ROM (*.nds)|*.nds;*.srl|GBA ROM (*.gba)|*.gba|Any file|*.*", Config::LastROMFolder); - if (!file) - { - EmuRunning = prevstatus; - return; - } - - int pos = strlen(file)-1; - while (file[pos] != '/' && file[pos] != '\\' && pos > 0) pos--; - strncpy(Config::LastROMFolder, file, pos); - Config::LastROMFolder[pos] = '\0'; - char* ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "gba")) - { - TryLoadROM(file, 1, prevstatus); - } - else - { - TryLoadROM(file, 0, prevstatus); - } - - uiFreeText(file); -} - -void OnSaveState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - SaveState(slot); -} - -void OnLoadState(uiMenuItem* item, uiWindow* window, void* param) -{ - int slot = *(int*)param; - LoadState(slot); -} - -void OnUndoStateLoad(uiMenuItem* item, uiWindow* window, void* param) -{ - UndoStateLoad(); -} - -void OnRun(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) - { - ROMPath[0][0] = '\0'; - NDS::LoadBIOS(); - - if (ROMPath[1][0] != '\0') - { - SetupSRAMPath(1); - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - } - - Run(); -} - -void OnPause(uiMenuItem* item, uiWindow* window, void* blarg) -{ - TogglePause(NULL); -} - -void OnReset(uiMenuItem* item, uiWindow* window, void* blarg) -{ - Reset(NULL); -} - -void OnStop(uiMenuItem* item, uiWindow* window, void* blarg) -{ - if (!RunningSomething) return; - - Stop(false); -} - -void OnOpenEmuSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgEmuSettings::Open(); -} - -void OnOpenInputConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - 
DlgInputConfig::Open(0); -} - -void OnOpenHotkeyConfig(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgInputConfig::Open(1); -} - -void OnOpenVideoSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgVideoSettings::Open(); -} - -void OnOpenAudioSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgAudioSettings::Open(); -} - -void OnOpenWifiSettings(uiMenuItem* item, uiWindow* window, void* blarg) -{ - DlgWifiSettings::Open(); -} - - -void OnSetSavestateSRAMReloc(uiMenuItem* item, uiWindow* window, void* param) -{ - Config::SavestateRelocSRAM = uiMenuItemChecked(item) ? 1:0; -} - - -void EnsureProperMinSize() -{ - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w0 = 256; - int h0 = 192; - int w1 = 256; - int h1 = 192; - - if (ScreenLayout == 0) // natural - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - SetMinSize(std::max(h0,h1), w0+ScreenGap+w1); - else - SetMinSize(std::max(w0,w1), h0+ScreenGap+h1); - } - else // horizontal - { - if (isHori) - SetMinSize(h0+ScreenGap+h1, std::max(w0,w1)); - else - SetMinSize(w0+ScreenGap+w1, std::max(h0,h1)); - } -} - -void OnSetScreenSize(uiMenuItem* item, uiWindow* window, void* param) -{ - int factor = *(int*)param; - bool isHori = (ScreenRotation == 1 || ScreenRotation == 3); - - int w = 256*factor; - int h = 192*factor; - - // FIXME - - if (ScreenLayout == 0) // natural - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else if (ScreenLayout == 1) // vertical - { - if (isHori) - uiWindowSetContentSize(window, h, (w*2)+ScreenGap); - else - uiWindowSetContentSize(window, w, (h*2)+ScreenGap); - } - else // horizontal - { - if (isHori) - uiWindowSetContentSize(window, (h*2)+ScreenGap, w); - else - uiWindowSetContentSize(window, (w*2)+ScreenGap, h); - } 
-} - -void OnSetScreenRotation(uiMenuItem* item, uiWindow* window, void* param) -{ - int rot = *(int*)param; - - int oldrot = ScreenRotation; - ScreenRotation = rot; - - int w, h; - uiWindowContentSize(window, &w, &h); - - bool isHori = (rot == 1 || rot == 3); - bool wasHori = (oldrot == 1 || oldrot == 3); - - EnsureProperMinSize(); - - if (ScreenLayout == 0) // natural - { - if (isHori ^ wasHori) - { - int blarg = h; - h = w; - w = blarg; - - uiWindowSetContentSize(window, w, h); - } - } - - SetupScreenRects(w, h); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenRot[i], i==ScreenRotation); -} - -void OnSetScreenGap(uiMenuItem* item, uiWindow* window, void* param) -{ - int gap = *(int*)param; - - //int oldgap = ScreenGap; - ScreenGap = gap; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 6; i++) - uiMenuItemSetChecked(MenuItem_ScreenGap[i], kScreenGap[i]==ScreenGap); -} - -void OnSetScreenLayout(uiMenuItem* item, uiWindow* window, void* param) -{ - int layout = *(int*)param; - ScreenLayout = layout; - - EnsureProperMinSize(); - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 3; i++) - uiMenuItemSetChecked(MenuItem_ScreenLayout[i], i==ScreenLayout); -} - -void OnSetScreenSizing(uiMenuItem* item, uiWindow* window, void* param) -{ - int sizing = *(int*)param; - ScreenSizing = sizing; - - SetupScreenRects(WindowWidth, WindowHeight); - - for (int i = 0; i < 4; i++) - uiMenuItemSetChecked(MenuItem_ScreenSizing[i], i==ScreenSizing); -} - -void OnSetScreenFiltering(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ScreenFilter = 1; - else Config::ScreenFilter = 0; -} - -void OnSetLimitFPS(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::LimitFPS = true; - else Config::LimitFPS = false; -} - -void OnSetAudioSync(uiMenuItem* item, uiWindow* window, void* 
blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::AudioSync = true; - else Config::AudioSync = false; -} - -void OnSetShowOSD(uiMenuItem* item, uiWindow* window, void* blarg) -{ - int chk = uiMenuItemChecked(item); - if (chk != 0) Config::ShowOSD = true; - else Config::ShowOSD = false; -} - -void ApplyNewSettings(int type) -{ -#ifdef JIT_ENABLED - if (type == 4) - { - Reset(NULL); - return; - } -#endif - - if (!RunningSomething) - { - if (type == 1) return; - } - - int prevstatus = EmuRunning; - EmuRunning = 3; - while (EmuStatus != 3); - - if (type == 0) // 3D renderer settings - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::UpdateRendererConfig(); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - GL_3DScale = Config::GL_ScaleFactor; // dorp - GL_ScreenSizeDirty = true; - } - else if (type == 1) // wifi settings - { - if (Wifi::MPInited) - { - Platform::MP_DeInit(); - Platform::MP_Init(); - } - - Platform::LAN_DeInit(); - Platform::LAN_Init(); - } - else if (type == 2) // video output method - { - bool usegl = Config::ScreenUseGL || (Config::_3DRenderer != 0); - if (usegl != Screen_UseGL) - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - OSD::DeInit(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - - Screen_UseGL = usegl; - RecreateMainWindow(usegl); - - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - } - else if (type == 3) // 3D renderer - { - if (Screen_UseGL) uiGLMakeContextCurrent(GLContext); - GPU3D::DeInitRenderer(); - GPU3D::InitRenderer(Screen_UseGL); - if (Screen_UseGL) uiGLMakeContextCurrent(NULL); - } - EmuRunning = prevstatus; -} - - -void CreateMainWindowMenu() -{ - uiMenu* menu; - uiMenuItem* menuitem; - - menu = uiNewMenu("File"); - menuitem = uiMenuAppendItem(menu, "Open ROM..."); - uiMenuItemOnClicked(menuitem, OnOpenFile, NULL); - 
uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Save state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tShift+F%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tShift+F9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnSaveState, (void*)&kSavestateNum[i]); - - MenuItem_SaveStateSlot[i] = ssitem; - } - - MenuItem_SaveState = uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Load state"); - - for (int i = 0; i < 9; i++) - { - char name[32]; - if (i < 8) - sprintf(name, "%d\tF%d", kSavestateNum[i], kSavestateNum[i]); - else - strcpy(name, "File...\tF9"); - - uiMenuItem* ssitem = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(ssitem, OnLoadState, (void*)&kSavestateNum[i]); - - MenuItem_LoadStateSlot[i] = ssitem; - } - - MenuItem_LoadState = uiMenuAppendSubmenu(menu, submenu); - } - menuitem = uiMenuAppendItem(menu, "Undo state load\tF12"); - uiMenuItemOnClicked(menuitem, OnUndoStateLoad, NULL); - MenuItem_UndoStateLoad = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Quit"); - uiMenuItemOnClicked(menuitem, OnCloseByMenu, NULL); - - menu = uiNewMenu("System"); - menuitem = uiMenuAppendItem(menu, "Run"); - uiMenuItemOnClicked(menuitem, OnRun, NULL); - menuitem = uiMenuAppendCheckItem(menu, "Pause"); - uiMenuItemOnClicked(menuitem, OnPause, NULL); - MenuItem_Pause = menuitem; - uiMenuAppendSeparator(menu); - menuitem = uiMenuAppendItem(menu, "Reset"); - uiMenuItemOnClicked(menuitem, OnReset, NULL); - MenuItem_Reset = menuitem; - menuitem = uiMenuAppendItem(menu, "Stop"); - uiMenuItemOnClicked(menuitem, OnStop, NULL); - MenuItem_Stop = menuitem; - - menu = uiNewMenu("Config"); - { - menuitem = uiMenuAppendItem(menu, "Emu settings"); - uiMenuItemOnClicked(menuitem, OnOpenEmuSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Input config"); - uiMenuItemOnClicked(menuitem, 
OnOpenInputConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Hotkey config"); - uiMenuItemOnClicked(menuitem, OnOpenHotkeyConfig, NULL); - menuitem = uiMenuAppendItem(menu, "Video settings"); - uiMenuItemOnClicked(menuitem, OnOpenVideoSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Audio settings"); - uiMenuItemOnClicked(menuitem, OnOpenAudioSettings, NULL); - menuitem = uiMenuAppendItem(menu, "Wifi settings"); - uiMenuItemOnClicked(menuitem, OnOpenWifiSettings, NULL); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Savestate settings"); - - MenuItem_SavestateSRAMReloc = uiMenuAppendCheckItem(submenu, "Separate savefiles"); - uiMenuItemOnClicked(MenuItem_SavestateSRAMReloc, OnSetSavestateSRAMReloc, NULL); - - uiMenuAppendSubmenu(menu, submenu); - } - uiMenuAppendSeparator(menu); - { - uiMenu* submenu = uiNewMenu("Screen size"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%dx", kScreenSize[i]); - uiMenuItem* item = uiMenuAppendItem(submenu, name); - uiMenuItemOnClicked(item, OnSetScreenSize, (void*)&kScreenSize[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen rotation"); - - for (int i = 0; i < 4; i++) - { - char name[32]; - sprintf(name, "%d", kScreenRot[i]*90); - MenuItem_ScreenRot[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenRot[i], OnSetScreenRotation, (void*)&kScreenRot[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Mid-screen gap"); - - //for (int i = 0; kScreenGap[i] != -1; i++) - for (int i = 0; i < 6; i++) - { - char name[32]; - sprintf(name, "%d pixels", kScreenGap[i]); - MenuItem_ScreenGap[i] = uiMenuAppendCheckItem(submenu, name); - uiMenuItemOnClicked(MenuItem_ScreenGap[i], OnSetScreenGap, (void*)&kScreenGap[i]); - } - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen layout"); - - MenuItem_ScreenLayout[0] = uiMenuAppendCheckItem(submenu, 
"Natural"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[0], OnSetScreenLayout, (void*)&kScreenLayout[0]); - MenuItem_ScreenLayout[1] = uiMenuAppendCheckItem(submenu, "Vertical"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[1], OnSetScreenLayout, (void*)&kScreenLayout[1]); - MenuItem_ScreenLayout[2] = uiMenuAppendCheckItem(submenu, "Horizontal"); - uiMenuItemOnClicked(MenuItem_ScreenLayout[2], OnSetScreenLayout, (void*)&kScreenLayout[2]); - - uiMenuAppendSubmenu(menu, submenu); - } - { - uiMenu* submenu = uiNewMenu("Screen sizing"); - - MenuItem_ScreenSizing[0] = uiMenuAppendCheckItem(submenu, "Even"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[0], OnSetScreenSizing, (void*)&kScreenSizing[0]); - MenuItem_ScreenSizing[1] = uiMenuAppendCheckItem(submenu, "Emphasize top"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[1], OnSetScreenSizing, (void*)&kScreenSizing[1]); - MenuItem_ScreenSizing[2] = uiMenuAppendCheckItem(submenu, "Emphasize bottom"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[2], OnSetScreenSizing, (void*)&kScreenSizing[2]); - MenuItem_ScreenSizing[3] = uiMenuAppendCheckItem(submenu, "Auto"); - uiMenuItemOnClicked(MenuItem_ScreenSizing[3], OnSetScreenSizing, (void*)&kScreenSizing[3]); - - uiMenuAppendSubmenu(menu, submenu); - } - - MenuItem_ScreenFilter = uiMenuAppendCheckItem(menu, "Screen filtering"); - uiMenuItemOnClicked(MenuItem_ScreenFilter, OnSetScreenFiltering, NULL); - - MenuItem_ShowOSD = uiMenuAppendCheckItem(menu, "Show OSD"); - uiMenuItemOnClicked(MenuItem_ShowOSD, OnSetShowOSD, NULL); - - uiMenuAppendSeparator(menu); - - MenuItem_LimitFPS = uiMenuAppendCheckItem(menu, "Limit framerate"); - uiMenuItemOnClicked(MenuItem_LimitFPS, OnSetLimitFPS, NULL); - - MenuItem_AudioSync = uiMenuAppendCheckItem(menu, "Audio sync"); - uiMenuItemOnClicked(MenuItem_AudioSync, OnSetAudioSync, NULL); -} - -void CreateMainWindow(bool opengl) -{ - MainWindow = uiNewWindow("melonDS " MELONDS_VERSION, - WindowWidth, WindowHeight, - Config::WindowMaximized, 1, 1); - 
uiWindowOnClosing(MainWindow, OnCloseWindow, NULL); - - uiWindowSetDropTarget(MainWindow, 1); - uiWindowOnDropFile(MainWindow, OnDropFile, NULL); - - uiWindowOnGetFocus(MainWindow, OnGetFocus, NULL); - uiWindowOnLoseFocus(MainWindow, OnLoseFocus, NULL); - - ScreenDrawInited = false; - bool opengl_good = opengl; - - if (!opengl) MainDrawArea = uiNewArea(&MainDrawAreaHandler); - else MainDrawArea = uiNewGLArea(&MainDrawAreaHandler, kGLVersions); - - uiWindowSetChild(MainWindow, uiControl(MainDrawArea)); - uiControlSetMinSize(uiControl(MainDrawArea), 256, 384); - uiAreaSetBackgroundColor(MainDrawArea, 0, 0, 0); - - uiControlShow(uiControl(MainWindow)); - uiControlSetFocus(uiControl(MainDrawArea)); - - if (opengl_good) - { - GLContext = uiAreaGetGLContext(MainDrawArea); - if (!GLContext) opengl_good = false; - } - if (opengl_good) - { - uiGLMakeContextCurrent(GLContext); - uiGLSetVSync(Config::ScreenVSync); - if (!GLScreen_Init()) opengl_good = false; - if (opengl_good) - { - OpenGL_UseShaderProgram(GL_ScreenShaderOSD); - OSD::Init(true); - } - uiGLMakeContextCurrent(NULL); - } - - if (opengl && !opengl_good) - { - printf("OpenGL: initialization failed\n"); - RecreateMainWindow(false); - Screen_UseGL = false; - } - - if (!opengl) OSD::Init(false); -} - -void DestroyMainWindow() -{ - uiControlDestroy(uiControl(MainWindow)); - - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - ScreenBitmap[0] = NULL; - ScreenBitmap[1] = NULL; -} - -void RecreateMainWindow(bool opengl) -{ - int winX, winY, maxi; - uiWindowPosition(MainWindow, &winX, &winY); - maxi = uiWindowMaximized(MainWindow); - DestroyMainWindow(); - CreateMainWindow(opengl); - uiWindowSetPosition(MainWindow, winX, winY); - uiWindowSetMaximized(MainWindow, maxi); -} - - -int main(int argc, char** argv) -{ - srand(time(NULL)); - - printf("melonDS " MELONDS_VERSION "\n"); - printf(MELONDS_URL "\n"); - -#if defined(__WIN32__) || defined(UNIX_PORTABLE) - 
if (argc > 0 && strlen(argv[0]) > 0) - { - int len = strlen(argv[0]); - while (len > 0) - { - if (argv[0][len] == '/') break; - if (argv[0][len] == '\\') break; - len--; - } - if (len > 0) - { - EmuDirectory = new char[len+1]; - strncpy(EmuDirectory, argv[0], len); - EmuDirectory[len] = '\0'; - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } - } - else - { - EmuDirectory = new char[2]; - strcpy(EmuDirectory, "."); - } -#else - const char* confdir = g_get_user_config_dir(); - const char* confname = "/melonDS"; - EmuDirectory = new char[strlen(confdir) + strlen(confname) + 1]; - strcat(EmuDirectory, confdir); - strcat(EmuDirectory, confname); -#endif - - // http://stackoverflow.com/questions/14543333/joystick-wont-work-using-sdl - SDL_SetHint(SDL_HINT_JOYSTICK_ALLOW_BACKGROUND_EVENTS, "1"); - - if (SDL_Init(SDL_INIT_HAPTIC) < 0) - { - printf("SDL couldn't init rumble\n"); - } - if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO | SDL_INIT_JOYSTICK) < 0) - { - printf("SDL shat itself :(\n"); - return 1; - } - - SDL_JoystickEventState(SDL_ENABLE); - - uiInitOptions ui_opt; - memset(&ui_opt, 0, sizeof(uiInitOptions)); - const char* ui_err = uiInit(&ui_opt); - if (ui_err != NULL) - { - printf("libui shat itself :( %s\n", ui_err); - uiFreeInitError(ui_err); - return 1; - } - - Config::Load(); - - if (Config::AudioVolume < 0) Config::AudioVolume = 0; - else if (Config::AudioVolume > 256) Config::AudioVolume = 256; - - if (!Platform::LocalFileExists("bios7.bin") || - !Platform::LocalFileExists("bios9.bin") || - !Platform::LocalFileExists("firmware.bin")) - { -#if defined(__WIN32__) || defined(UNIX_PORTABLE) - const char* locationName = "the directory you run melonDS from"; -#else - char* locationName = EmuDirectory; -#endif - char msgboxtext[512]; - sprintf(msgboxtext, - "One or more of the following required files don't exist or couldn't be accessed:\n\n" - "bios7.bin -- ARM7 BIOS\n" - "bios9.bin -- ARM9 BIOS\n" - "firmware.bin -- firmware image\n\n" 
- "Dump the files from your DS and place them in %s.\n" - "Make sure that the files can be accessed.", - locationName - ); - - uiMsgBoxError(NULL, "BIOS/Firmware not found", msgboxtext); - - uiUninit(); - SDL_Quit(); - return 0; - } - if (!Platform::LocalFileExists("firmware.bin.bak")) - { - // verify the firmware - // - // there are dumps of an old hacked firmware floating around on the internet - // and those are problematic - // the hack predates WFC, and, due to this, any game that alters the WFC - // access point data will brick that firmware due to it having critical - // data in the same area. it has the same problem on hardware. - // - // but this should help stop users from reporting that issue over and over - // again, when the issue is not from melonDS but from their firmware dump. - // - // I don't know about all the firmware hacks in existence, but the one I - // looked at has 0x180 bytes from the header repeated at 0x3FC80, but - // bytes 0x0C-0x14 are different. - - FILE* f = Platform::OpenLocalFile("firmware.bin", "rb"); - u8 chk1[0x180], chk2[0x180]; - - fseek(f, 0, SEEK_SET); - fread(chk1, 1, 0x180, f); - fseek(f, -0x380, SEEK_END); - fread(chk2, 1, 0x180, f); - - memset(&chk1[0x0C], 0, 8); - memset(&chk2[0x0C], 0, 8); - - fclose(f); - - if (!memcmp(chk1, chk2, 0x180)) - { - uiMsgBoxError(NULL, - "Problematic firmware dump", - "You are using an old hacked firmware dump.\n" - "Firmware boot will stop working if you run any game that alters WFC settings.\n\n" - "Note that the issue is not from melonDS, it would also happen on an actual DS."); - } - } - { - const char* romlist_missing = "Save memory type detection will not work correctly.\n\n" - "You should use the latest version of romlist.bin (provided in melonDS release packages)."; -#if !defined(UNIX_PORTABLE) && !defined(__WIN32__) - std::string missingstr = std::string(romlist_missing) + - "\n\nThe ROM list should be placed in " + g_get_user_data_dir() + "/melonds/, otherwise " - "melonDS will 
search for it in the current working directory."; - const char* romlist_missing_text = missingstr.c_str(); -#else - const char* romlist_missing_text = romlist_missing; -#endif - - FILE* f = Platform::OpenDataFile("romlist.bin"); - if (f) - { - u32 data; - fread(&data, 4, 1, f); - fclose(f); - - if ((data >> 24) == 0) // old CRC-based list - { - uiMsgBoxError(NULL, "Your version of romlist.bin is outdated.", romlist_missing_text); - } - } - else - { - uiMsgBoxError(NULL, "romlist.bin not found.", romlist_missing_text); - } - } - - CreateMainWindowMenu(); - - MainDrawAreaHandler.Draw = OnAreaDraw; - MainDrawAreaHandler.MouseEvent = OnAreaMouseEvent; - MainDrawAreaHandler.MouseCrossed = OnAreaMouseCrossed; - MainDrawAreaHandler.DragBroken = OnAreaDragBroken; - MainDrawAreaHandler.KeyEvent = OnAreaKeyEvent; - MainDrawAreaHandler.Resize = OnAreaResize; - - WindowWidth = Config::WindowWidth; - WindowHeight = Config::WindowHeight; - - Screen_UseGL = Config::ScreenUseGL || (Config::_3DRenderer != 0); - - GL_3DScale = Config::GL_ScaleFactor; - if (GL_3DScale < 1) GL_3DScale = 1; - else if (GL_3DScale > 8) GL_3DScale = 8; - - CreateMainWindow(Screen_UseGL); - - ScreenRotation = Config::ScreenRotation; - ScreenGap = Config::ScreenGap; - ScreenLayout = Config::ScreenLayout; - ScreenSizing = Config::ScreenSizing; - -#define SANITIZE(var, min, max) if ((var < min) || (var > max)) var = 0; - SANITIZE(ScreenRotation, 0, 3); - SANITIZE(ScreenLayout, 0, 2); - SANITIZE(ScreenSizing, 0, 3); -#undef SANITIZE - - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_SaveStateSlot[i]); - for (int i = 0; i < 9; i++) uiMenuItemDisable(MenuItem_LoadStateSlot[i]); - uiMenuItemDisable(MenuItem_UndoStateLoad); - - uiMenuItemDisable(MenuItem_Pause); - uiMenuItemDisable(MenuItem_Reset); - uiMenuItemDisable(MenuItem_Stop); - - uiMenuItemSetChecked(MenuItem_SavestateSRAMReloc, Config::SavestateRelocSRAM?1:0); - - uiMenuItemSetChecked(MenuItem_ScreenRot[ScreenRotation], 1); - 
uiMenuItemSetChecked(MenuItem_ScreenLayout[ScreenLayout], 1); - uiMenuItemSetChecked(MenuItem_ScreenSizing[ScreenSizing], 1); - - for (int i = 0; i < 6; i++) - { - if (ScreenGap == kScreenGap[i]) - uiMenuItemSetChecked(MenuItem_ScreenGap[i], 1); - } - - OnSetScreenRotation(MenuItem_ScreenRot[ScreenRotation], MainWindow, (void*)&kScreenRot[ScreenRotation]); - - uiMenuItemSetChecked(MenuItem_ScreenFilter, Config::ScreenFilter==1); - uiMenuItemSetChecked(MenuItem_LimitFPS, Config::LimitFPS==1); - uiMenuItemSetChecked(MenuItem_AudioSync, Config::AudioSync==1); - uiMenuItemSetChecked(MenuItem_ShowOSD, Config::ShowOSD==1); - -#ifdef MELONCAP - MelonCap::Init(); -#endif // MELONCAP - - AudioSync = SDL_CreateCond(); - AudioSyncLock = SDL_CreateMutex(); - - AudioFreq = 48000; // TODO: make configurable? - SDL_AudioSpec whatIwant, whatIget; - memset(&whatIwant, 0, sizeof(SDL_AudioSpec)); - whatIwant.freq = AudioFreq; - whatIwant.format = AUDIO_S16LSB; - whatIwant.channels = 2; - whatIwant.samples = 1024; - whatIwant.callback = AudioCallback; - AudioDevice = SDL_OpenAudioDevice(NULL, 0, &whatIwant, &whatIget, SDL_AUDIO_ALLOW_FREQUENCY_CHANGE); - if (!AudioDevice) - { - printf("Audio init failed: %s\n", SDL_GetError()); - } - else - { - AudioFreq = whatIget.freq; - printf("Audio output frequency: %d Hz\n", AudioFreq); - SDL_PauseAudioDevice(AudioDevice, 1); - } - - memset(&whatIwant, 0, sizeof(SDL_AudioSpec)); - whatIwant.freq = 44100; - whatIwant.format = AUDIO_S16LSB; - whatIwant.channels = 1; - whatIwant.samples = 1024; - whatIwant.callback = MicCallback; - MicDevice = SDL_OpenAudioDevice(NULL, 1, &whatIwant, &whatIget, 0); - if (!MicDevice) - { - printf("Mic init failed: %s\n", SDL_GetError()); - MicBufferLength = 0; - } - else - { - SDL_PauseAudioDevice(MicDevice, 1); - } - - memset(MicBuffer, 0, sizeof(MicBuffer)); - MicBufferReadPos = 0; - MicBufferWritePos = 0; - - MicWavBuffer = NULL; - if (Config::MicInputType == 3) MicLoadWav(Config::MicWavPath); - - JoystickID = 
Config::JoystickID; - Joystick = NULL; - OpenJoystick(); - - EmuRunning = 2; - RunningSomething = false; - EmuThread = SDL_CreateThread(EmuThreadFunc, "melonDS magic", NULL); - - if (argc > 1) - { - char* file = argv[1]; - char* ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "nds") || !strcasecmp(ext, "srl")) - { - strncpy(ROMPath[0], file, 1023); - ROMPath[0][1023] = '\0'; - - SetupSRAMPath(0); - - if (NDS::LoadROM(ROMPath[0], SRAMPath[0], Config::DirectBoot)) - Run(); - } - - if (argc > 2) - { - file = argv[2]; - ext = &file[strlen(file)-3]; - - if (!strcasecmp(ext, "gba")) - { - strncpy(ROMPath[1], file, 1023); - ROMPath[1][1023] = '\0'; - - SetupSRAMPath(1); - - NDS::LoadGBAROM(ROMPath[1], SRAMPath[1]); - } - } - } - - uiMain(); - - if (Joystick) SDL_JoystickClose(Joystick); - if (AudioDevice) SDL_CloseAudioDevice(AudioDevice); - if (MicDevice) SDL_CloseAudioDevice(MicDevice); - - SDL_DestroyCond(AudioSync); - SDL_DestroyMutex(AudioSyncLock); - - if (MicWavBuffer) delete[] MicWavBuffer; - -#ifdef MELONCAP - MelonCap::DeInit(); -#endif // MELONCAP - - if (ScreenBitmap[0]) uiDrawFreeBitmap(ScreenBitmap[0]); - if (ScreenBitmap[1]) uiDrawFreeBitmap(ScreenBitmap[1]); - - Config::ScreenRotation = ScreenRotation; - Config::ScreenGap = ScreenGap; - Config::ScreenLayout = ScreenLayout; - Config::ScreenSizing = ScreenSizing; - - Config::Save(); - - uiUninit(); - SDL_Quit(); - delete[] EmuDirectory; - return 0; -} - -#ifdef __WIN32__ - -#include - -int CALLBACK WinMain(HINSTANCE hinst, HINSTANCE hprev, LPSTR cmdline, int cmdshow) -{ - int argc = 0; - wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc); - char* nullarg = ""; - - char** argv = new char*[argc]; - for (int i = 0; i < argc; i++) - { - int len = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, NULL, 0, NULL, NULL); - if (len < 1) return NULL; - argv[i] = new char[len]; - int res = WideCharToMultiByte(CP_UTF8, 0, argv_w[i], -1, argv[i], len, NULL, NULL); - if (res != len) { delete[] argv[i]; 
argv[i] = nullarg; } - } - - if (AttachConsole(ATTACH_PARENT_PROCESS)) - { - freopen("CONOUT$", "w", stdout); - freopen("CONOUT$", "w", stderr); - printf("\n"); - } - - int ret = main(argc, argv); - - printf("\n\n>"); - - for (int i = 0; i < argc; i++) if (argv[i] != nullarg) delete[] argv[i]; - delete[] argv; - - return ret; -} - -#endif -- cgit v1.2.3